diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 1aa9271..76153e0 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -9,14 +9,9 @@ jobs: build: strategy: matrix: - python: ["3.7", "3.10"] + python: ["3.10", "3.13"] extras: ["", "--all-extras"] - include: - - python: "3.7" - os: ubuntu-22.04 - - python: "3.10" - os: ubuntu-latest - runs-on: ${{ matrix.os }} + runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 @@ -24,10 +19,10 @@ jobs: python-version: ${{ matrix.python }} - name: Install uv uses: astral-sh/setup-uv@v5 - - run: uv sync ${{ matrix.extras }} + - run: uv sync ${{ matrix.extras }} --python ${{ matrix.python }} - run: uv pip install codecov - run: uv run coverage run -m unittest nolds.test_measures - run: uv run codecov env: CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} - if: ${{ matrix.python == '3.10' && matrix.extras != '' }} + if: ${{ matrix.python == '3.13' && matrix.extras != '' }} diff --git a/.vscode/settings.json b/.vscode/settings.json index dbafbbd..ee6b32c 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -7,16 +7,16 @@ "python.testing.unittestArgs": [ "-v", "-s", - "./nolds", + ".", "-p", "test_*.py" ], "[python]": { - "editor.formatOnSave": false, // disable format on save for now to not mess up any files that haven't been converted yet + "editor.formatOnSave": true, "editor.defaultFormatter": "charliermarsh.ruff", "editor.codeActionsOnSave": { - "source.fixAll": "never", - "source.organizeImports": "never" + "source.fixAll": "explicit", + "source.organizeImports": "explicit" } } } \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index f666c12..1c3e93d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,13 +7,27 @@ and this project adheres to [Semantic Versioning](http://semver.org/). ## [Unreleased] ### Added + +* Type hints for the complete API. +* Documentation for test hypotheses. 
+ ### Changed * Switches from using `setup.py` to `pyproject.toml` using `uv`. * Moves ruff config into `pyproject.toml`. +* CSV-based datasets are now loaded with `csv.reader`. +* Datasets that are available as global variables are loaded lazily now. +* `datasets.qrandom` output now has a more accurate dtype of `np.uint16`. +* Uses `importlib.resources.files` instead of deprecated `pkg_resources.resource_stream`. +* Applies dtype `float64` or `int32` in all internal conversions using `asarray` or `array`. +* Applies Ruff formatting throughout the codebase and addresses all linting errors. +* Switches from two spaces to four spaces for indentation. +* Supported range of Python versions is now 3.10–3.13. Updated CI accordingly. ### Fixed +* Swaps deprecated `pkg_resources` for `importlib.resources`. + ## [0.6.2] ### Fixed diff --git a/doc/source/conf.py b/doc/source/conf.py index 345747a..ace8ab5 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- # # Nolds documentation build configuration file, created by # sphinx-quickstart on Wed Aug 10 17:47:20 2016. @@ -13,71 +12,71 @@ # All configuration values have a default; values that are commented out # serve to show the default. -import sys import os +import sys # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. -#sys.path.insert(0, os.path.abspath('.')) -sys.path.insert(0, os.path.abspath('../..')) +# sys.path.insert(0, os.path.abspath('.')) +sys.path.insert(0, os.path.abspath("../..")) # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. -#needs_sphinx = '1.0' +# needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. 
They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.coverage', - 'sphinx.ext.mathjax', - 'sphinx.ext.ifconfig', - 'sphinx.ext.viewcode', + "sphinx.ext.autodoc", + "sphinx.ext.coverage", + "sphinx.ext.mathjax", + "sphinx.ext.ifconfig", + "sphinx.ext.viewcode", ] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # source_suffix = ['.rst', '.md'] -source_suffix = '.rst' +source_suffix = ".rst" # The encoding of source files. -#source_encoding = 'utf-8-sig' +# source_encoding = 'utf-8-sig' # The master toctree document. -master_doc = 'index' +master_doc = "index" # General information about the project. -project = 'Nolds' -copyright = u'2016-2024, Christopher Schölzel' -author = u'Christopher Schölzel' +project = "Nolds" +copyright = "2016-2024, Christopher Schölzel" +author = "Christopher Schölzel" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. -version = '0.6' +version = "0.6" # The full version, including alpha/beta/rc tags. -release = '0.6.2' +release = "0.6.2" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. -language = 'en' +language = "en" # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: -#today = '' +# today = '' # Else, today_fmt is used as the format for a strftime call. 
-#today_fmt = '%B %d, %Y' +# today_fmt = '%B %d, %Y' # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. @@ -85,27 +84,27 @@ # The reST default role (used for this markup: `text`) to use for all # documents. -#default_role = None +# default_role = None # If true, '()' will be appended to :func: etc. cross-reference text. -#add_function_parentheses = True +# add_function_parentheses = True # If true, the current module name will be prepended to all description # unit titles (such as .. function::). -#add_module_names = True +# add_module_names = True # If true, sectionauthor and moduleauthor directives will be shown in the # output. They are ignored by default. -#show_authors = False +# show_authors = False # The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' +pygments_style = "sphinx" # A list of ignored prefixes for module index sorting. -#modindex_common_prefix = [] +# modindex_common_prefix = [] # If true, keep warnings as "system message" paragraphs in the built documents. -#keep_warnings = False +# keep_warnings = False # If true, `todo` and `todoList` produce output, else they produce nothing. todo_include_todos = False @@ -115,33 +114,33 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. -html_theme = 'alabaster' +html_theme = "alabaster" # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. html_theme_options = { - "page_width": "1100px" + "page_width": "1100px", } # Add any paths that contain custom themes here, relative to this directory. -#html_theme_path = [] +# html_theme_path = [] # The name for this set of Sphinx documents. If None, it defaults to # " v documentation". -#html_title = None +# html_title = None # A shorter title for the navigation bar. 
Default is the same as html_title. -#html_short_title = None +# html_short_title = None # The name of an image file (relative to this directory) to place at the top # of the sidebar. -#html_logo = None +# html_logo = None # The name of an image file (within the static path) to use as favicon of the # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 # pixels large. -#html_favicon = None +# html_favicon = None # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, @@ -151,109 +150,105 @@ # Add any extra paths that contain custom files (such as robots.txt or # .htaccess) here, relative to this directory. These files are copied # directly to the root of the documentation. -#html_extra_path = [] +# html_extra_path = [] # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. -#html_last_updated_fmt = '%b %d, %Y' +# html_last_updated_fmt = '%b %d, %Y' # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. -#html_use_smartypants = True +# html_use_smartypants = True # Custom sidebar templates, maps document names to template names. -#html_sidebars = {} +# html_sidebars = {} # Additional templates that should be rendered to pages, maps page names to # template names. -#html_additional_pages = {} +# html_additional_pages = {} # If false, no module index is generated. -#html_domain_indices = True +# html_domain_indices = True # If false, no index is generated. -#html_use_index = True +# html_use_index = True # If true, the index is split into individual pages for each letter. -#html_split_index = False +# html_split_index = False # If true, links to the reST sources are added to the pages. -#html_show_sourcelink = True +# html_show_sourcelink = True # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 
-#html_show_sphinx = True +# html_show_sphinx = True # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. -#html_show_copyright = True +# html_show_copyright = True # If true, an OpenSearch description file will be output, and all pages will # contain a tag referring to it. The value of this option must be the # base URL from which the finished HTML is served. -#html_use_opensearch = '' +# html_use_opensearch = '' # This is the file name suffix for HTML files (e.g. ".xhtml"). -#html_file_suffix = None +# html_file_suffix = None # Language to be used for generating the HTML full-text search index. # Sphinx supports the following languages: # 'da', 'de', 'en', 'es', 'fi', 'fr', 'h', 'it', 'ja' # 'nl', 'no', 'pt', 'ro', 'r', 'sv', 'tr' -#html_search_language = 'en' +# html_search_language = 'en' # A dictionary with options for the search language support, empty by default. # Now only 'ja' uses this config value -#html_search_options = {'type': 'default'} +# html_search_options = {'type': 'default'} # The name of a javascript file (relative to the configuration directory) that # implements a search results scorer. If empty, the default will be used. -#html_search_scorer = 'scorer.js' +# html_search_scorer = 'scorer.js' # Output file base name for HTML help builder. -htmlhelp_basename = 'Noldsdoc' +htmlhelp_basename = "Noldsdoc" # -- Options for LaTeX output --------------------------------------------- latex_elements = { -# The paper size ('letterpaper' or 'a4paper'). -#'papersize': 'letterpaper', - -# The font size ('10pt', '11pt' or '12pt'). -#'pointsize': '10pt', - -# Additional stuff for the LaTeX preamble. -#'preamble': '', - -# Latex figure (float) alignment -#'figure_align': 'htbp', + # The paper size ('letterpaper' or 'a4paper'). + #'papersize': 'letterpaper', + # The font size ('10pt', '11pt' or '12pt'). + #'pointsize': '10pt', + # Additional stuff for the LaTeX preamble. 
+ #'preamble': '', + # Latex figure (float) alignment + #'figure_align': 'htbp', } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - (master_doc, 'Nolds.tex', 'Nolds Documentation', - u'Christopher Schölzel', 'manual'), + (master_doc, "Nolds.tex", "Nolds Documentation", "Christopher Schölzel", "manual"), ] # The name of an image file (relative to this directory) to place at the top of # the title page. -#latex_logo = None +# latex_logo = None # For "manual" documents, if this is true, then toplevel headings are parts, # not chapters. -#latex_use_parts = False +# latex_use_parts = False # If true, show page references after internal links. -#latex_show_pagerefs = False +# latex_show_pagerefs = False # If true, show URL addresses after external links. -#latex_show_urls = False +# latex_show_urls = False # Documents to append as an appendix to all manuals. -#latex_appendices = [] +# latex_appendices = [] # If false, no module index is generated. -#latex_domain_indices = True +# latex_domain_indices = True # -- Options for manual page output --------------------------------------- @@ -261,12 +256,11 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ - (master_doc, 'nolds', 'Nolds Documentation', - [author], 1) + (master_doc, "nolds", "Nolds Documentation", [author], 1), ] # If true, show URL addresses after external links. 
-#man_show_urls = False +# man_show_urls = False # -- Options for Texinfo output ------------------------------------------- @@ -275,21 +269,27 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - (master_doc, 'Nolds', 'Nolds Documentation', - author, 'Nolds', 'One line description of project.', - 'Miscellaneous'), + ( + master_doc, + "Nolds", + "Nolds Documentation", + author, + "Nolds", + "One line description of project.", + "Miscellaneous", + ), ] # Documents to append as an appendix to all manuals. -#texinfo_appendices = [] +# texinfo_appendices = [] # If false, no module index is generated. -#texinfo_domain_indices = True +# texinfo_domain_indices = True # How to display URL addresses: 'footnote', 'no', or 'inline'. -#texinfo_show_urls = 'footnote' +# texinfo_show_urls = 'footnote' # If true, do not generate a @detailmenu in the "Top" node's menu. -#texinfo_no_detailmenu = False +# texinfo_no_detailmenu = False -# autodoc_mock_imports = ['numpy', 'future', 'setuptools', 'builtins'] \ No newline at end of file +# autodoc_mock_imports = ['numpy', 'future', 'setuptools', 'builtins'] diff --git a/nolds/__init__.py b/nolds/__init__.py index e542d1f..4ebe796 100644 --- a/nolds/__init__.py +++ b/nolds/__init__.py @@ -1,6 +1,63 @@ -from .measures import lyap_r, lyap_e, sampen, hurst_rs, corr_dim, dfa, \ - binary_n, logarithmic_n, logarithmic_r, expected_h, logmid_n, expected_rs, \ - lyap_r_len, lyap_e_len, rowwise_chebyshev, rowwise_euclidean, mfhurst_b, \ - mfhurst_dm -from .datasets import brown72, tent_map, logistic_map, fbm, fgn, qrandom, \ - load_qrandom, load_financial, barabasi1991_fractal +"""Main module for nolds, containing all important user-facing API elements.""" + +from .datasets import ( + barabasi1991_fractal, + brown72, + fbm, + fgn, + load_financial, + load_qrandom, + logistic_map, + qrandom, + tent_map, +) +from .measures import ( + binary_n, + corr_dim, + dfa, + expected_h, + 
expected_rs, + hurst_rs, + logarithmic_n, + logarithmic_r, + logmid_n, + lyap_e, + lyap_e_len, + lyap_r, + lyap_r_len, + mfhurst_b, + mfhurst_dm, + rowwise_chebyshev, + rowwise_euclidean, + sampen, +) + +__all__ = [ + "barabasi1991_fractal", + "binary_n", + "brown72", + "corr_dim", + "dfa", + "expected_h", + "expected_rs", + "fbm", + "fgn", + "hurst_rs", + "load_financial", + "load_qrandom", + "logarithmic_n", + "logarithmic_r", + "logistic_map", + "logmid_n", + "lyap_e", + "lyap_e_len", + "lyap_r", + "lyap_r_len", + "mfhurst_b", + "mfhurst_dm", + "qrandom", + "rowwise_chebyshev", + "rowwise_euclidean", + "sampen", + "tent_map", +] diff --git a/nolds/datasets.py b/nolds/datasets.py index 65489a9..6dc37d0 100644 --- a/nolds/datasets.py +++ b/nolds/datasets.py @@ -1,468 +1,514 @@ -# -*- coding: utf-8 -*- -from __future__ import (absolute_import, division, - print_function, unicode_literals) -from builtins import ( - bytes, dict, int, list, object, range, str, ascii, chr, hex, input, next, - oct, open, pow, round, super, filter, map, zip -) -import numpy as np -import pkg_resources +"""Contains functions to load example datasets used in nolds.""" + +from __future__ import annotations + +import csv import datetime +import importlib.resources +import itertools +from typing import IO, TYPE_CHECKING, Any + +import numpy as np + +if TYPE_CHECKING: + from collections.abc import Generator + + +def lorenz_euler( + length: int, + sigma: float, + rho: float, + beta: float, + dt: float = 0.01, + start: list[float] | None = None, +) -> np.ndarray[tuple[int, int], np.dtype[np.float32]]: + """Simulates the Lorenz system using a simple Euler method. + + The Lorenz system is a three dimensional dynamical system given + by the following equations: + + dx/dt = sigma * (y - x) + dy/dt = rho * x - y - x * z + dz/dt = x * y - beta * z + + Args: + length: Number of data points to generate. + sigma: Sigma parameter of the Lorenz system. + rho: Rho parameter of the Lorenz system. 
+ beta: Beta parameter of the Lorenz system. + dt: Time delta between two data points. + start: Optional starting point for the trajectory. + + Returns: + 2d-array of (x, y, z) data points in the simulated Lorenz system. + """ + if start is None: + start = [1, 1, 1] + + def lorenz( + state: np.ndarray[tuple[int], np.dtype[np.float32]], sigma: float, rho: float, beta: float + ) -> np.ndarray[tuple[int], np.dtype[np.float32]]: + x, y, z = state + # NOTE: Numpy 1.x stores intermediate results as float64 + # => to achieve consistency between numpy versions, we have to use + # float32 for all values that enter the formula to simulate numpy 1.x + # behavior with numpy 2.x. + return np.array( + [ + np.float32(sigma) * (y - x), + np.float32(rho) * x - y - x * z, + x * y - np.float32(beta) * z, + ], + dtype=np.float32, + ) + + trajectory = np.zeros((length, 3), dtype=np.float32) + trajectory[0] = start + for i in range(1, length): + trajectory[i] = trajectory[i - 1] + lorenz(trajectory[i - 1], sigma, rho, beta) * dt + return trajectory + + +def lorenz_lyap(sigma: float, rho: float, beta: float) -> float: + """Calculate the exact Lyapunov dimension of the Lorenz system. + + This uses the definition according to Leonov 2015 [ll_1]_. + + Args: + sigma: Sigma parameter of the Lorenz system. + rho: Rho parameter of the Lorenz system. + beta: Beta parameter of the Lorenz system. + + Returns: + Prescribed Lyapunov dimension for the Lorenz system according to Leonov 2015. + + References: + .. [ll_1] G. A. Leonov and N. V. Kuznetsov, “On differences and similarities in the + analysis of Lorenz, Chen, and Lu systems,” Applied Mathematics and Computation, + vol. 256, pp. 334–343, Apr. 2015, doi: 10.1016/j.amc.2014.12.132. 
+ """ + return 3 - 2 * (sigma + beta + 1) / (sigma + 1 + np.sqrt((sigma - 1) ** 2 + 4 * sigma * rho)) + + +def fbm( + n: int, + H: float = 0.75, + random_seed: int | None = None, +) -> np.ndarray[tuple[int], np.dtype[np.float64]]: + """Generates fractional brownian motions of desired length. + + Author: + Christian Thomae + + References: + .. [fbm_1] https://en.wikipedia.org/wiki/Fractional_Brownian_motion#Method_1_of_simulation + + Args: + n: Length of sequence to generate. + H: Hurst parameter. + random_seed: Seed used for random number generation. + + Returns: + Simulated fractional brownian motion + """ + if H < 0 or H > 1: + msg = f"H must be between 0 and 1, got {H} instead." + raise ValueError(msg) + + def R( # noqa: N802 + t: np.ndarray[tuple[int], np.dtype[np.float64]], + s: np.ndarray[tuple[int], np.dtype[np.float64]], + ) -> np.ndarray[tuple[int], np.dtype[np.float64]]: + twoH = 2 * H + return 0.5 * (s**twoH + t**twoH - np.abs(t - s) ** twoH) + + # form the matrix tau + gamma = R(*np.mgrid[0:n, 0:n]) # apply R to every element in matrix + w, P = np.linalg.eigh(gamma) + L = np.diag(w) + sigma = np.dot(np.dot(P, np.sqrt(L)), np.linalg.inv(P)) + gen = np.random.default_rng(seed=random_seed) + v = gen.standard_normal(n) + return np.dot(sigma, v) + + +def fgn( + n: int, + H: float = 0.75, + random_seed: int | None = None, +) -> np.ndarray[tuple[int], np.dtype[np.float64]]: + """Generates fractional gaussian noise of desired length. + + References: + .. [fgn_1] https://en.wikipedia.org/wiki/Fractional_Brownian_motion + + Args: + n: Length of sequence to generate. + H: Hurst parameter. + random_seed: Seed used for random number generation. + + Returns: + Simulated fractional gaussian noise + """ + return np.diff(fbm(n + 1, H=H, random_seed=random_seed)) + + +def qrandom(n: int) -> np.ndarray[tuple[int], np.dtype[np.uint16]]: + """Creates an array of n true random numbers. + + The data is obtained from the quantum random number generator at qrng.anu.edu.au. 
+ + This function requires the package quantumrandom and an internet connection. + + Args: + n: length of the random array + + Return: + Array of truly random unsigned 16 bit int values + """ + import quantumrandom + + return np.concatenate( + [ + np.array(quantumrandom.get_data(data_type="uint16", array_length=1024), dtype=np.uint16) + for i in range(int(np.ceil(n / 1024.0))) + ] + )[:n] + + +def load_qrandom() -> np.ndarray[tuple[int], np.dtype[np.int32]]: + """Loads a set of 10000 random numbers generated by qrandom. + + This dataset can be used when you want to do some limited tests with "true" + random data without an internet connection. + + Returns: + Dataset of 10k quantum random numbers packaged with nolds. + """ + fname = "datasets/qrandom.npy" + with importlib.resources.files("nolds").joinpath(fname).open("rb") as f: + return np.load(f) + + +def load_brown72() -> np.ndarray[tuple[int], np.dtype[np.float64]]: + """Returns the dataset brown72 with a prescribed Hurst exponent of 0.72. + + Source: http://bearcave.com/misl/misl_tech/wavelets/hurst/index.html + """ + fname = "datasets/brown72.npy" + with importlib.resources.files("nolds").joinpath(fname).open("rb") as f: + return np.load(f) + + +def load_lorenz_physionet() -> tuple[ + np.ndarray[tuple[int], np.dtype[np.float64]], np.ndarray[tuple[int, int], np.dtype[np.float64]] +]: + """Loads a dataset of the Lorenz system (X variable) and PhysioNet's DFA output on that dataset. 
+ + The input data was created with the following code: + + data = datasets.lorenz_euler( + 3000, 10, 28, 8/3.0, start=[0.1,0.1,0.1], dt=0.012 + )[1000:,0] + + The output from PhysioNet was created by calling: + + dfa < lorenz.txt > lorenz_physionet.txt + + Returns: + Tuple containing + - time series of the X variable of the Lorenz system that was used as input + - x- and y-coordinates of the line fitting step in the PhysioNet output + """ + fname = "datasets/lorenz.txt" + with importlib.resources.files("nolds").joinpath(fname).open("rb") as f: + data_in = np.loadtxt(f) + fname = "datasets/lorenz_physionet.txt" + with importlib.resources.files("nolds").joinpath(fname).open("rb") as f: + data_out = np.loadtxt(f) + return data_in, data_out -def lorenz_euler(length, sigma, rho, beta, dt=0.01, start=[1,1,1]): - """ - Simulates the Lorenz system using a simple Euler method - - The Lorenz system is a three dimensional dynamical system given - by the following equations: - - dx/dt = sigma * (y - x) - dy/dt = rho * x - y - x * z - dz/dt = x * y - beta * z - """ - def lorenz(state, sigma, rho, beta): - x, y, z = state - # NOTE: Numpy 1.x stores intermediate results as float64 - # => to achieve consistency between numpy versions, we have to use - # float32 for all values that enter the formula to simulate numpy 1.x - # behavior with numpy 2.x. - return np.array([ - np.float32(sigma) * (y - x), - np.float32(rho) * x - y - x * z, - x * y - np.float32(beta) * z - ], dtype="float32") - trajectory = np.zeros((length, 3), dtype="float32") - trajectory[0] = start - for i in range(1, length): - # t = i * dt - trajectory[i] = trajectory[i-1] + lorenz(trajectory[i-1], sigma, rho, beta) * dt - return trajectory - -def lorenz_lyap(sigma, rho, beta): - """ - Calculates the exact Lyapunov dimension of the Lorenz system according to - Leonov 2015 [ll_1]_. - - References: - .. [ll_1] G. A. Leonov and N. V.
Kuznetsov, “On differences and similarities in the - analysis of Lorenz, Chen, and Lu systems,” Applied Mathematics and Computation, - vol. 256, pp. 334–343, Apr. 2015, doi: 10.1016/j.amc.2014.12.132. - """ - return 3 - 2 * (sigma + beta + 1) / (sigma + 1 + np.sqrt((sigma-1) ** 2 + 4 * sigma * rho)) - - -def fbm(n, H=0.75): - """ - Generates fractional brownian motions of desired length. - - Author: - Christian Thomae - - References: - .. [fbm_1] https://en.wikipedia.org/wiki/Fractional_Brownian_motion#Method_1_of_simulation - - Args: - n (int): - length of sequence to generate - Kwargs: - H (float): - hurst parameter - - Returns: - array of float: - simulated fractional brownian motion - """ - # TODO more detailed description of fbm - assert H > 0 and H < 1 - - def R(t, s): - twoH = 2 * H - return 0.5 * (s**twoH + t**twoH - np.abs(t - s)**twoH) - # form the matrix tau - gamma = R(*np.mgrid[0:n, 0:n]) # apply R to every element in matrix - w, P = np.linalg.eigh(gamma) - L = np.diag(w) - sigma = np.dot(np.dot(P, np.sqrt(L)), np.linalg.inv(P)) - v = np.random.randn(n) - return np.dot(sigma, v) - - -def fgn(n, H=0.75): - """ - Generates fractional gaussian noise of desired length. - - References: - .. [fgn_1] https://en.wikipedia.org/wiki/Fractional_Brownian_motion - - Args: - n (int): - length of sequence to generate - - Kwargs: - H (float): - hurst parameter - - Returns: - array of float: - simulated fractional gaussian noise - """ - return np.diff(fbm(n+1, H=H)) - - -def qrandom(n): - """ - Creates an array of n true random numbers obtained from the quantum random - number generator at qrng.anu.edu.au - - This function requires the package quantumrandom and an internet connection. 
- - Args: - n (int): - length of the random array - - Return: - array of ints: - array of truly random unsigned 16 bit int values - """ - import quantumrandom - return np.concatenate([ - quantumrandom.get_data(data_type='uint16', array_length=1024) - for i in range(int(np.ceil(n/1024.0))) - ])[:n] - - -def load_qrandom(): - """ - Loads a set of 10000 random numbers generated by qrandom. - - This dataset can be used when you want to do some limited tests with "true" - random data without an internet connection. - - Returns: - int array - the dataset - """ - fname = "datasets/qrandom.npy" - with pkg_resources.resource_stream(__name__, fname) as f: - return np.load(f) - - -def load_brown72(): - """ - Loads the dataset brown72 with a prescribed Hurst exponent of 0.72 - - Source: http://bearcave.com/misl/misl_tech/wavelets/hurst/index.html - - Returns: - float array: - the dataset - """ - fname = "datasets/brown72.npy" - with pkg_resources.resource_stream(__name__, fname) as f: - return np.load(f) - - -def load_lorenz_physionet(): - """ - Loads a dataset containing the X variable of the Lorenz system - as well as the output of PhysioNet's dfa implementation on that dataset. 
- - The input data was created with the following code: - - data = datasets.lorenz_euler( - 3000, 10, 28, 8/3.0, start=[0.1,0.1,0.1], dt=0.012 - )[1000:,0] - - The ouptut from PhysioNet was created by calling: - - dfa < lorenz.txt > lorenz_physionet.txt - - Returns: - 1d float array: - time series of the X variable of the Lorenz system that was used as input - 2d float array: - x- and y-coordinates of the line fitting step in the PhysioNet output - """ - fname = "datasets/lorenz.txt" - with pkg_resources.resource_stream(__name__, fname) as f: - data_in = np.loadtxt(f) - fname = "datasets/lorenz_physionet.txt" - with pkg_resources.resource_stream(__name__, fname) as f: - data_out = np.loadtxt(f) - return data_in, data_out +def tent_map(x: float, steps: int, mu: float = 2) -> Generator[float, None, None]: + """Generates a time series of the tent map. -def tent_map(x, steps, mu=2): - """ - Generates a time series of the tent map. + Characteristics and Background: + The name of the tent map is derived from the fact that the plot of x_i vs + x_i+1 looks like a tent. For mu > 1 one application of the mapping function + can be viewed as stretching the surface on which the value is located and + then folding the area that is greater than one back towards the zero. This + corresponds nicely to the definition of chaos as expansion in one dimension + which is counteracted by a compression in another dimension. - Characteristics and Background: - The name of the tent map is derived from the fact that the plot of x_i vs - x_i+1 looks like a tent. For mu > 1 one application of the mapping function - can be viewed as stretching the surface on which the value is located and - then folding the area that is greater than one back towards the zero. This - corresponds nicely to the definition of chaos as expansion in one dimension - which is counteracted by a compression in another dimension. 
- - Calculating the Lyapunov exponent: - The lyapunov exponent of the tent map can be easily calculated as due to - this stretching behavior a small difference delta between two neighboring - points will indeed grow exponentially by a factor of mu in each iteration. - We thus can assume that: + Calculating the Lyapunov exponent: + The lyapunov exponent of the tent map can be easily calculated as due to + this stretching behavior a small difference delta between two neighboring + points will indeed grow exponentially by a factor of mu in each iteration. + We thus can assume that: - delta_n = delta_0 * mu^n + delta_n = delta_0 * mu^n - We now only have to change the basis to e to obtain the exact formula that - is used for the definition of the lyapunov exponent: + We now only have to change the basis to e to obtain the exact formula that + is used for the definition of the lyapunov exponent: - delta_n = delta_0 * e^(ln(mu) * n) + delta_n = delta_0 * e^(ln(mu) * n) - Therefore the lyapunov exponent of the tent map is: + Therefore the lyapunov exponent of the tent map is: - lambda = ln(mu) + lambda = ln(mu) - References: - .. [tm_1] https://en.wikipedia.org/wiki/Tent_map + References: + .. [tm_1] https://en.wikipedia.org/wiki/Tent_map - Args: - x (float): - starting point - steps (int): - number of steps for which the generator should run + Args: + x: starting point + steps: number of steps for which the generator should run + mu: parameter mu that controls the behavior of the map - Kwargs: - mu (int): - parameter mu that controls the behavior of the map + Yields: + The next value in the tent map series. 
+ """ + for _ in range(steps): + x = mu * x if x < 0.5 else mu * (1 - x) # noqa: PLR2004 + yield x - Returns: - generator object: - the generator that creates the time series - """ - for _ in range(steps): - x = mu * x if x < 0.5 else mu * (1 - x) - yield x -# TODO should all math be formatted like this, or should the documentation of -# logistic_map revert to a version that is more readable as plain text +def logistic_map(x: float, steps: int, r: float = 4) -> Generator[float, None, None]: + r"""Generates a time series of the logistic map. + Characteristics and Background: + The logistic map is among the simplest examples for a time series that can + exhibit chaotic behavior depending on the parameter r. For r between 2 and + 3, the series quickly becomes static. At r=3 the first bifurcation point is + reached after which the series starts to oscillate. Beginning with r = 3.6 + it shows chaotic behavior with a few islands of stability until perfect + chaos is achieved at r = 4. -def logistic_map(x, steps, r=4): - r""" - Generates a time series of the logistic map. + Calculating the Lyapunov exponent: + To calculate the "true" Lyapunov exponent of the logistic map, we first + have to make a few observations for maps in general that are repeated + applications of a function to a starting value. - Characteristics and Background: - The logistic map is among the simplest examples for a time series that can - exhibit chaotic behavior depending on the parameter r. For r between 2 and - 3, the series quickly becomes static. At r=3 the first bifurcation point is - reached after which the series starts to oscillate. Beginning with r = 3.6 - it shows chaotic behavior with a few islands of stability until perfect - chaos is achieved at r = 4. 
+ If we have two starting values that differ by some infinitesimal + :math:`delta_0` then according to the definition of the lyapunov exponent + we will have an exponential divergence: - Calculating the Lyapunov exponent: - To calculate the "true" Lyapunov exponent of the logistic map, we first - have to make a few observations for maps in general that are repeated - applications of a function to a starting value. + .. math:: + |\delta_n| = |\delta_0| e^{\lambda n} - If we have two starting values that differ by some infinitesimal - :math:`delta_0` then according to the definition of the lyapunov exponent - we will have an exponential divergence: + We can now write that: - .. math:: - |\delta_n| = |\delta_0| e^{\lambda n} - - We can now write that: - - .. math:: - e^{\lambda n} = \lim_{\delta_0 -> 0} |\frac{\delta_n}{\delta_0}| - - This is the definition of the derivative :math:`\frac{dx_n}{dx_0}` of a - point :math:`x_n` in the time series with respect to the starting point - :math:`x_0` (or rather the absolute value of that derivative). Now we can - use the fact that due to the definition of our map as repetitive - application of some f we have: - - .. math:: - f^{n\prime}(x) = f(f(f(...f(x_0)...))) = f'(x_n-1) \cdot f'(x_n-2) - \cdot ... \cdot f'(x_0) - - with - - .. math:: - e^{\lambda n} = |f^{n\prime}(x)| - - we now have - - .. math:: - - e^{\lambda n} &= |f'(x_n-1) \cdot f'(x_n-2) \cdot ... \cdot f'(x_0)| \\ - \Leftrightarrow \\ - \lambda n &= \ln |f'(x_n-1) \cdot f'(x_n-2) \cdot ... \cdot f'(x_0)| \\ - \Leftrightarrow \\ - \lambda &= \frac{1}{n} \ln |f'(x_n-1) \cdot f'(x_n-2) \cdot ... \cdot f'(x_0)| \\ - &= \frac{1}{n} \sum_{k=0}^{n-1} \ln |f'(x_k)| - - With this sum we can now calculate the lyapunov exponent for any map. - For the logistic map we simply have to calculate :math:`f'(x)` and as we - have - - .. math:: - f(x) = r x (1-x) = rx - rx² - - we now get - - .. math:: - f'(x) = r - 2 rx - - - - References: - .. 
[lm_1] https://en.wikipedia.org/wiki/Tent_map - .. [lm_2] https://blog.abhranil.net/2015/05/15/lyapunov-exponent-of-the-logistic-map-mathematica-code/ - - Args: - x (float): - starting point - steps (int): - number of steps for which the generator should run - - Kwargs: - r (int): - parameter r that controls the behavior of the map - - Returns: - generator object: - the generator that creates the time series - """ - for _ in range(steps): - x = r * x * (1 - x) - yield x - - -def load_financial(): - """ - Loads the following datasets from CSV files in this package: - - - jkse: Jakarta Composite Index, downloaded on 2019-02-12 from https://finance.yahoo.com/quote/%5EJKSE/history?period1=631148400&period2=988668000&interval=1d&filter=history&frequency=1d - - n225: Nikkei 225, downloaded on 2019-02-12 from https://finance.yahoo.com/quote/%5EN225/history?period1=631148400&period2=988668000&interval=1d&filter=history&frequency=1d - - ndx: NASDAQ 100, downloaded on 2019-02-12 from https://finance.yahoo.com/quote/%5ENDX/history?period1=631148400&period2=988668000&interval=1d&filter=history&frequency=1d - - All datasets are daily prices from the period from 1990-01-01 to 2001-05-01 - missing values are NaN except for opening values which are treated as - follows: - - - If the first opening value is missing, the first *existing* opening value - is used for the first day. - - All other missing opening values are filled by the close value of the last - day where data was available. - - Returns: - list of tuple(1d-array, 2d-array): - datasets with days as array of date objects and 2d-array with the columns - "Open", "High", "Low", "Close", "Adj Close", and "Volume". Note that - "Open" values have been padded to ensure that there are no NaNs left. 
- """ - - def load_finance_yahoo_data(f): - f.readline() - days = [] - values = [] - for l in f: - fields = l.decode("utf-8") - fields = fields.split(",") - d = datetime.datetime.strptime(fields[0], "%Y-%m-%d") - v = [np.nan if x.strip() == "null" else float(x) for x in fields[1:]] - days.append(d) - values.append(v) - return np.array(days), np.array(values) - - def pad_opening_values(values): - # fill first value from future if required - first = 0 - while np.isnan(values[first, 0]): - first += 1 - values[0, 0] = values[first, 0] - # iterate over all indices where data is missing - for i in np.where(np.isnan(values[:, 0]))[0]: - j = i - # pad opening value with close value of previous data - while np.isnan(values[j][3]): - j -= 1 - values[i, 0] = values[j, 3] - - data = [] - for index in ["^JKSE", "^N225", "^NDX"]: - fname = "datasets/{}.csv".format(index) - with pkg_resources.resource_stream(__name__, fname) as f: - days, values = load_finance_yahoo_data(f) - pad_opening_values(values) - data.append((days, values)) - return data - - -def barabasi1991_fractal(size, iterations, b1=0.8, b2=0.5): - """ - Generates the simple fractal described in [bf]_. - - The fractal divides a rectangular segment starting at (x0, y0) with width w - and height h along the x axis into four line segments of equal size with the - boundary points [x0, x1, x2, x3, x4]. It has two parameters b1 and b2 that - allow to choose the value for y(x1) and y(x3) while it always holds that - y(x0) = y0, y(x2) = y0 and y(x4) = y0 + h. - - The process starts with a single line segment of height 1 spanning the whole - data range. In each iteration, the rectangles spanning the line segments - from the previous iteration are subdivided according to the same rule. - - References: - .. [bf] A.-L. Barabási and T. Vicsek, “Multifractality of self-affine - fractals,” Physical Review A, vol. 44, no. 4, pp. 2730–2733, 1991. 
- - Args: - size (int): - number of data points in the resulting array - iterations (int): - number of iterations to perform - - Kwargs: - b1 (float): - relative height at x1 (between 0 and 1) - b2 (float): - relative height at x3 (between 0 and 1) - - Returns: - (1d-array of float): - generated fractal - """ - def b1991(x0, y0, w, h): - if h < 0: - # for a segment with negative slope we have flip the x-axis - d, nxtp = b1991(x0, y0 + h, w, -h) - return d[::-1], nxtp - x1 = x0 + w // 4 - x2 = x0 + w // 2 - x3 = x2 + w // 4 - x4 = x0 + w - data = np.zeros(w, dtype=np.float64) - data[x0 - x0:x1 - x0] = np.linspace(0, 1, x1 - x0) * b1 * h + y0 - data[x1 - x0:x2 - x0] = np.linspace(1, 0, x2 - x1) * b1 * h + y0 - data[x2 - x0:x3 - x0] = np.linspace(0, 1, x3 - x2) * b2 * h + y0 - data[x3 - x0:x4 - x0] = np.linspace(0, 1, x4 - x3) * (1 - b2) * h \ - + y0 + b2 * h - return data, [x0, x1, x2, x3, x4] - fractal = np.linspace(0, 1, size) - intervals = [(0, size)] - for _ in range(iterations): - next_intervals = [] - for x1, x2 in intervals: - d, nxtp = b1991(x1, fractal[x1], x2 - x1, fractal[x2-1] - fractal[x1]) - fractal[x1:x2] = d - next_intervals.extend( - [(np1, np2) for np1, np2 in zip(nxtp[:-1], nxtp[1:])] - ) - intervals = next_intervals - return fractal - - -brown72 = load_brown72() -jkse, n225, ndx = load_financial() + .. math:: + e^{\lambda n} = \lim_{\delta_0 -> 0} |\frac{\delta_n}{\delta_0}| + + This is the definition of the derivative :math:`\frac{dx_n}{dx_0}` of a + point :math:`x_n` in the time series with respect to the starting point + :math:`x_0` (or rather the absolute value of that derivative). Now we can + use the fact that due to the definition of our map as repetitive + application of some f we have: + + .. math:: + f^{n\prime}(x) = f(f(f(...f(x_0)...))) = f'(x_n-1) \cdot f'(x_n-2) + \cdot ... \cdot f'(x_0) + + with + + .. math:: + e^{\lambda n} = |f^{n\prime}(x)| + + we now have + + .. math:: + + e^{\lambda n} &= |f'(x_n-1) \cdot f'(x_n-2) \cdot ... 
\cdot f'(x_0)| \\ + \Leftrightarrow \\ + \lambda n &= \ln |f'(x_n-1) \cdot f'(x_n-2) \cdot ... \cdot f'(x_0)| \\ + \Leftrightarrow \\ + \lambda &= \frac{1}{n} \ln |f'(x_n-1) \cdot f'(x_n-2) \cdot ... \cdot f'(x_0)| \\ + &= \frac{1}{n} \sum_{k=0}^{n-1} \ln |f'(x_k)| + + With this sum we can now calculate the lyapunov exponent for any map. + For the logistic map we simply have to calculate :math:`f'(x)` and as we + have + + .. math:: + f(x) = r x (1-x) = rx - rx² + + we now get + + .. math:: + f'(x) = r - 2 rx + + + + References: + .. [lm_1] https://en.wikipedia.org/wiki/Tent_map + .. [lm_2] https://blog.abhranil.net/2015/05/15/lyapunov-exponent-of-the-logistic-map-mathematica-code/ + + Args: + x: starting point + steps: number of steps for which the generator should run + r: parameter r that controls the behavior of the map + + Yields: + The next value in the logistic map time series. + """ + for _ in range(steps): + x = r * x * (1 - x) + yield x + + +def load_financial() -> list[ + tuple[ + np.ndarray[tuple[int], np.dtype[np.datetime64]], + np.ndarray[tuple[int, int], np.dtype[np.float64]], + ] +]: + """Loads three financial datasets from CSV files in this package. + + - jkse: Jakarta Composite Index, downloaded on 2019-02-12 from https://finance.yahoo.com/quote/%5EJKSE/history?period1=631148400&period2=988668000&interval=1d&filter=history&frequency=1d + - n225: Nikkei 225, downloaded on 2019-02-12 from https://finance.yahoo.com/quote/%5EN225/history?period1=631148400&period2=988668000&interval=1d&filter=history&frequency=1d + - ndx: NASDAQ 100, downloaded on 2019-02-12 from https://finance.yahoo.com/quote/%5ENDX/history?period1=631148400&period2=988668000&interval=1d&filter=history&frequency=1d + + All datasets are daily prices from the period from 1990-01-01 to 2001-05-01 + missing values are NaN except for opening values which are treated as + follows: + + - If the first opening value is missing, the first *existing* opening value + is used for the first day. 
+ - All other missing opening values are filled by the close value of the last + day where data was available. + + Returns: + Datasets with days as array of date objects and 2d-array with the columns + "Open", "High", "Low", "Close", "Adj Close", and "Volume". Note that + "Open" values have been padded to ensure that there are no NaNs left. + """ + + def load_finance_yahoo_data( + f: IO[str], + ) -> tuple[ + np.ndarray[tuple[int], np.dtype[np.datetime64]], + np.ndarray[tuple[int, int], np.dtype[np.float64]], + ]: + days = [] + values = [] + reader = csv.reader(f, delimiter=",") + next(reader) # skip header row + for row in reader: + d = datetime.datetime.strptime(row[0], "%Y-%m-%d").astimezone(datetime.timezone.utc) + v = [np.nan if x.strip() == "null" else float(x) for x in row[1:]] + days.append(d) + values.append(v) + return np.array(days), np.array(values) + + def pad_opening_values(values: np.ndarray[tuple[int, int], np.dtype[np.float64]]) -> None: + # fill first value from future if required + first = 0 + while np.isnan(values[first, 0]): + first += 1 + values[0, 0] = values[first, 0] + # iterate over all indices where data is missing + for i in np.where(np.isnan(values[:, 0]))[0]: + j = i + # pad opening value with close value of previous data + while np.isnan(values[j][3]): + j -= 1 + values[i, 0] = values[j, 3] + + data = [] + for index in ["^JKSE", "^N225", "^NDX"]: + fname = f"datasets/{index}.csv" + with importlib.resources.files("nolds").joinpath(fname).open("r", encoding="utf-8") as f: + days, values = load_finance_yahoo_data(f) + pad_opening_values(values) + data.append((days, values)) + return data + + +def barabasi1991_fractal( + size: int, iterations: int, b1: float = 0.8, b2: float = 0.5 +) -> np.ndarray[tuple[int], np.dtype[np.float64]]: + """Generates the simple fractal described in [bf]_. 
+ + The fractal divides a rectangular segment starting at (x0, y0) with width w + and height h along the x axis into four line segments of equal size with the + boundary points [x0, x1, x2, x3, x4]. It has two parameters b1 and b2 that + allow to choose the value for y(x1) and y(x3) while it always holds that + y(x0) = y0, y(x2) = y0 and y(x4) = y0 + h. + + The process starts with a single line segment of height 1 spanning the whole + data range. In each iteration, the rectangles spanning the line segments + from the previous iteration are subdivided according to the same rule. + + References: + .. [bf] A.-L. Barabási and T. Vicsek, “Multifractality of self-affine + fractals,” Physical Review A, vol. 44, no. 4, pp. 2730–2733, 1991. + + Args: + size: number of data points in the resulting array + iterations: number of iterations to perform + + Kwargs: + b1: relative height at x1 (between 0 and 1) + b2: relative height at x3 (between 0 and 1) + + Returns: + The generated fractal + """ + + def b1991( + x0: int, y0: int, w: int, h: int + ) -> tuple[np.ndarray[tuple[int], np.dtype[np.float64]], list[int]]: + if h < 0: + # for a segment with negative slope we have flip the x-axis + d, nxtp = b1991(x0, y0 + h, w, -h) + return d[::-1], nxtp + x1 = x0 + w // 4 + x2 = x0 + w // 2 + x3 = x2 + w // 4 + x4 = x0 + w + data = np.zeros(w, dtype=np.float64) + data[x0 - x0 : x1 - x0] = np.linspace(0, 1, x1 - x0) * b1 * h + y0 + data[x1 - x0 : x2 - x0] = np.linspace(1, 0, x2 - x1) * b1 * h + y0 + data[x2 - x0 : x3 - x0] = np.linspace(0, 1, x3 - x2) * b2 * h + y0 + data[x3 - x0 : x4 - x0] = np.linspace(0, 1, x4 - x3) * (1 - b2) * h + y0 + b2 * h + return data, [x0, x1, x2, x3, x4] + + fractal = np.linspace(0, 1, size) + intervals = [(0, size)] + for _ in range(iterations): + next_intervals = [] + for x1, x2 in intervals: + d, nxtp = b1991(x1, fractal[x1], x2 - x1, fractal[x2 - 1] - fractal[x1]) + fractal[x1:x2] = d + next_intervals.extend( + [(np1, np2) for np1, np2 in 
itertools.pairwise(nxtp)], + ) + intervals = next_intervals + return fractal + + +def __getattr__(name: str) -> Any: # noqa: ANN401 + """Provide datasets as variables via lazy evaluation. + + See https://peps.python.org/pep-0562/ for documentation of this type of + module-level __getattr__. + """ + if name == "brown72": + global brown72 # noqa: PLW0603 + brown72 = load_brown72() + return brown72 + if name in ["jkse", "n225", "ndx"]: + global jkse, n225, ndx + jkse, n225, ndx = load_financial() + match name: + case "jkse": + return jkse + case "n225": + return n225 + case "ndx": + return ndx + msg = f"module {__name__!r} has no attribute {name!r}" + raise AttributeError(msg) diff --git a/nolds/examples.py b/nolds/examples.py index b13801d..80bf29f 100644 --- a/nolds/examples.py +++ b/nolds/examples.py @@ -1,620 +1,601 @@ -# -*- coding: utf-8 -*- -from __future__ import (absolute_import, division, - print_function, unicode_literals) -from builtins import ( - bytes, dict, int, list, object, range, str, ascii, chr, hex, input, next, - oct, open, pow, round, super, filter, map, zip -) -from . import measures as nolds -from . import datasets +"""Example use cases for measures implemented in nolds. + +The functions in this module aim to recreate experiments published in literature. +""" + +from __future__ import annotations + +import argparse +from typing import TYPE_CHECKING, Literal + import numpy as np +from . import datasets +from . import measures as nolds -def weron_2002_figure2(n=10000): - """ - Recreates figure 2 of [w]_ comparing the reported values by Weron to the - values obtained by the functions in this package. - - The experiment consists of n iterations where the hurst exponent of randomly - generated gaussian noise is calculated. This is done with differing sequence - lengths of 256, 512, 1024, ...., 65536. 
The average estimated hurst exponent - over all iterations is plotted for the following configurations: - - * ``weron`` is the Anis-Lloyd-corrected Hurst exponent calculated by Weron - * ``rs50`` is the Anis-Lloyd-corrected Hurst exponent calculated by Nolds - with the same parameters as used by Weron - * ``weron_raw`` is the uncorrected Hurst exponent calculated by Weron - * ``rs50_raw`` is the uncorrected Hurst exponent calculated by Nolds with the - same parameters as used by Weron - * ``rsn`` is the Anis-Lloyd-corrected Hurst exponent calculated by Nolds with - the default settings of Nolds - - The values reported by Weron are only measured from the plot in the PDF - version of the paper and can therefore have some small inaccuracies. - - This function requires the package ``matplotlib``. - - References: - - .. [w] R. Weron, “Estimating long-range dependence: finite sample - properties and confidence intervals,” Physica A: Statistical Mechanics - and its Applications, vol. 312, no. 1, pp. 285–299, 2002. 
- - Kwargs: - n (int): - number of iterations of the experiment (Weron used 10000, but this takes - a while) - """ - # local import to avoid dependency for non-debug use - import matplotlib.pyplot as plt - # note: these values are calculated by measurements in inkscape of the plot - # from the paper - reported = [6.708, 13.103, 20.240, 21.924, 22.256, 24.112, 24.054, 26.299, - 26.897] - reported_raw = [160.599, 141.663, 128.454, 115.617, 103.651, 95.481, 86.810, - 81.799, 76.270] - - def height_to_h(height): - return 0.49 + height / 29.894 * 0.01 - reported = height_to_h(np.array(reported)) - reported_raw = height_to_h(np.array(reported_raw)) - data = [] - for e in range(8, 17): - l = 2**e - nvals = 2**np.arange(6, e) - rsn = np.mean([ - nolds.hurst_rs(np.random.normal(size=l), fit="poly") - for _ in range(n) - ]) - rs50 = np.mean([ - nolds.hurst_rs(np.random.normal(size=l), fit="poly", nvals=nvals) - for _ in range(n) - ]) - rs50_raw = np.mean([ - nolds.hurst_rs( - np.random.normal(size=l), fit="poly", nvals=nvals, corrected=False - ) - for _ in range(n) - ]) - data.append((rsn, rs50, rs50_raw)) - lines = plt.plot(np.arange(8, 17), data) - r = plt.plot(np.arange(8, 17), reported) - rr = plt.plot(np.arange(8, 17), reported_raw) - plt.legend(r + rr + lines, ("weron", "weron_raw", "rsn", "rs50", "rs50_raw")) - plt.xticks(np.arange(8, 17), 2**np.arange(8, 17)) - plt.xlabel("sequence length") - plt.ylabel("estimated hurst exponent") - plt.show() - - -def plot_hurst_hist(): - """ - Plots a histogram of values obtained for the hurst exponent of uniformly - distributed white noise. - - This function requires the package ``matplotlib``. 
- """ - # local import to avoid dependency for non-debug use - import matplotlib.pyplot as plt - hs = [ - nolds.hurst_rs(np.random.random(size=10000), corrected=True) - for _ in range(100) - ] - plt.hist(hs, bins=20) - plt.xlabel("esimated value of hurst exponent") - plt.ylabel("number of experiments") - plt.show() - - -def plot_lyap(maptype="logistic"): - """ - Plots a bifurcation plot of the given map and superimposes the true - lyapunov exponent as well as the estimates of the largest lyapunov exponent - obtained by ``lyap_r`` and ``lyap_e``. The idea for this plot is taken - from [ll]_. - - This function requires the package ``matplotlib``. - - References: - - .. [ll] Manfred Füllsack, "Lyapunov exponent", - url: http://systems-sciences.uni-graz.at/etextbook/sw2/lyapunov.html - - Kwargs: - maptype (str): - can be either ``"logistic"`` for the logistic map or ``"tent"`` for the - tent map. - """ - # local import to avoid dependency for non-debug use - import matplotlib.pyplot as plt - - x_start = 0.1 - n = 140 - nbifur = 40 - if maptype == "logistic": - param_name = "r" - param_range = np.arange(2, 4, 0.01) - full_data = np.array([ - np.fromiter(datasets.logistic_map(x_start, n, r), dtype="float32") - for r in param_range - ]) - # It can be proven that the lyapunov exponent of the logistic map - # (or any map that is an iterative application of a function) can be - # calculated as the mean of the logarithm of the absolute of the - # derivative at the individual data points. 
- # For a proof see for example: - # https://blog.abhranil.net/2015/05/15/lyapunov-exponent-of-the-logistic-map-mathematica-code/ - # Derivative of logistic map: f(x) = r * x * (1 - x) = r * x - r * x² - # => f'(x) = r - 2 * r * x - lambdas = [ - np.mean(np.log(abs(r - 2 * r * x[np.where(x != 0.5)]))) - for x, r in zip(full_data, param_range) +if TYPE_CHECKING: + from nolds.measures import FloatArray1D, FloatArray2D, IntArray1D, NumberArrayLike1D + + +def weron_2002_figure2(n: int = 10000) -> None: + """Recreates figure 2 of Weron 2002 ([w]_). + + The experiment consists of n iterations where the hurst exponent of randomly + generated gaussian noise is calculated. This is done with differing sequence + lengths of 256, 512, 1024, ...., 65536. The average estimated hurst exponent + over all iterations is plotted for the following configurations: + + * ``weron`` is the Anis-Lloyd-corrected Hurst exponent calculated by Weron + * ``rs50`` is the Anis-Lloyd-corrected Hurst exponent calculated by Nolds + with the same parameters as used by Weron + * ``weron_raw`` is the uncorrected Hurst exponent calculated by Weron + * ``rs50_raw`` is the uncorrected Hurst exponent calculated by Nolds with the + same parameters as used by Weron + * ``rsn`` is the Anis-Lloyd-corrected Hurst exponent calculated by Nolds with + the default settings of Nolds + + The values reported by Weron are only measured from the plot in the PDF + version of the paper and can therefore have some small inaccuracies. + + This function requires the package ``matplotlib``. + + References: + + .. [w] R. Weron, “Estimating long-range dependence: finite sample + properties and confidence intervals,” Physica A: Statistical Mechanics + and its Applications, vol. 312, no. 1, pp. 285–299, 2002. 
+ + Args: + n: number of iterations of the experiment (Weron used 10000, but this takes + a while) + """ + # local import to avoid dependency for non-debug use + import matplotlib.pyplot as plt + + # note: these values are calculated by measurements in inkscape of the plot + # from the paper + reported = [6.708, 13.103, 20.240, 21.924, 22.256, 24.112, 24.054, 26.299, 26.897] + reported_raw = [160.599, 141.663, 128.454, 115.617, 103.651, 95.481, 86.810, 81.799, 76.270] + + def height_to_h( + height: FloatArray1D, + ) -> FloatArray1D: + """Returns Hurst exponent for specific height coordinates in the inkscape file. + + Args: + height: The height coordinates from the inkscape file. + """ + return 0.49 + height / 29.894 * 0.01 + + reported = height_to_h(np.array(reported)) + reported_raw = height_to_h(np.array(reported_raw)) + rng = np.random.default_rng(3897509205) + data = [] + for e in range(8, 17): + length = 2**e + nvals = 2 ** np.arange(6, e) + rsn = np.mean([nolds.hurst_rs(rng.normal(size=length), fit="poly") for _ in range(n)]) + rs50 = np.mean( + [nolds.hurst_rs(rng.normal(size=length), fit="poly", nvals=nvals) for _ in range(n)] + ) + rs50_raw = np.mean( + [ + nolds.hurst_rs( + rng.normal(size=length), + fit="poly", + nvals=nvals, + corrected=False, + ) + for _ in range(n) + ] + ) + data.append((rsn, rs50, rs50_raw)) + lines = plt.plot(np.arange(8, 17), data) + r = plt.plot(np.arange(8, 17), reported) + rr = plt.plot(np.arange(8, 17), reported_raw) + plt.legend(r + rr + lines, ("weron", "weron_raw", "rsn", "rs50", "rs50_raw")) + plt.xticks(np.arange(8, 17), [str(x) for x in 2 ** np.arange(8, 17)]) + plt.xlabel("sequence length") + plt.ylabel("estimated hurst exponent") + plt.show() + + +def plot_hurst_hist() -> None: + """Plots a histogram of values obtained for the hurst exponent of white noise. + + This function requires the package ``matplotlib``. 
+ """ + # local import to avoid dependency for non-debug use + import matplotlib.pyplot as plt + + rng = np.random.default_rng(869879538) + hs = [nolds.hurst_rs(rng.random(size=10000), corrected=True) for _ in range(100)] + plt.hist(hs, bins=20) + plt.xlabel("esimated value of hurst exponent") + plt.ylabel("number of experiments") + plt.show() + + +def plot_lyap(maptype: Literal["logistic", "tent"] = "logistic") -> None: + """Creates a bifurcation plot of the given map and its lyapunov exponents. + + This superimposes the true lyapunov exponent as well as the estimates of the + largest lyapunov exponent obtained by ``lyap_r`` and ``lyap_e``. + The idea for this plot is taken from [ll]_. + + This function requires the package ``matplotlib``. + + References: + + .. [ll] Manfred Füllsack, "Lyapunov exponent", + url: http://systems-sciences.uni-graz.at/etextbook/sw2/lyapunov.html + + Args: + maptype: can be either ``"logistic"`` for the logistic map or ``"tent"`` for the tent map. + """ + # local import to avoid dependency for non-debug use + import matplotlib.pyplot as plt + + x_start = 0.1 + n = 140 + nbifur = 40 + if maptype == "logistic": + param_name = "r" + param_range = np.arange(2, 4, 0.01) + full_data = np.array( + [ + np.fromiter(datasets.logistic_map(x_start, n, float(r)), dtype="float32") + for r in param_range + ] + ) + # It can be proven that the lyapunov exponent of the logistic map + # (or any map that is an iterative application of a function) can be + # calculated as the mean of the logarithm of the absolute of the + # derivative at the individual data points. 
+ # For a proof see for example: + # https://blog.abhranil.net/2015/05/15/lyapunov-exponent-of-the-logistic-map-mathematica-code/ + # Derivative of logistic map: f(x) = r * x * (1 - x) = r * x - r * x² + # => f'(x) = r - 2 * r * x + x_0 = 0.5 # avoid zero crossings in f'(x) + lambdas = [ + np.mean(np.log(abs(r - 2 * r * x[np.where(x != x_0)]))) + for x, r in zip(full_data, param_range, strict=True) + ] + elif maptype == "tent": + param_name = "$\\mu$" + param_range = np.arange(0, 2, 0.01) + full_data: FloatArray2D = np.array( + [ + np.fromiter(datasets.tent_map(x_start, n, float(mu)), dtype="float32") + for mu in param_range + ] + ) + # for the tent map the lyapunov exponent is much easier to calculate + # since the values are multiplied by mu in each step, two trajectories + # starting in x and x + delta will have a distance of delta * mu^n after n + # steps. Therefore the lyapunov exponent should be log(mu). + lambdas = np.log(param_range, where=param_range > 0) + lambdas[np.where(param_range <= 0)] = np.nan + else: + msg = f"maptype {maptype} not recognized" + raise ValueError(msg) + + kwargs_e = {"emb_dim": 6, "matrix_dim": 2} + kwargs_r = {"emb_dim": 6, "lag": 2, "min_tsep": 20, "trajectory_len": 20} + lambdas_e = [np.max(nolds.lyap_e(d, **kwargs_e)) for d in full_data] # pyright: ignore reportCallIssue + lambdas_r = [nolds.lyap_r(d, **kwargs_r) for d in full_data] # pyright: ignore reportCallIssue + bifur_x = np.repeat(param_range, nbifur) + bifur = np.reshape(full_data[:, -nbifur:], nbifur * param_range.shape[0]) + + plt.title(f"Lyapunov exponent of the {maptype} map") + plt.plot(param_range, lambdas, "b-", label="true lyap. 
exponent") + elab = "estimation using lyap_e" + rlab = "estimation using lyap_r" + plt.plot(param_range, lambdas_e, color="#00AAAA", label=elab) + plt.plot(param_range, lambdas_r, color="#AA00AA", label=rlab) + plt.plot(param_range, np.zeros(len(param_range)), "g--") + plt.plot(bifur_x, bifur, "ro", alpha=0.1, label="bifurcation plot") + plt.ylim((-2, 2)) + plt.xlabel(param_name) + plt.ylabel(f"lyap. exp / {maptype}(x, {param_name})") + plt.legend(loc="best") + plt.show() + + +def profiling() -> None: + """Runs a profiling test for the function ``lyap_e``. + + This function is mainly used for development and requires the package ``cProfile``. + """ + import cProfile + + n = 10000 + rng = np.random.default_rng(2030628104) + data = np.cumsum(rng.random(n) - 0.5) + cProfile.runctx("lyap_e(data)", {"lyap_e": nolds.lyap_e}, {"data": data}) + + +def hurst_compare_nvals(data: NumberArrayLike1D, nvals: IntArray1D | None = None) -> None: + """Creates a plot that compares the results of different nvals for the function hurst_rs. 
+ + Args: + data: the input data from which the hurst exponent should be estimated + nvals: a manually selected value for the nvals parameter that should be plotted + in comparison to the default choices + """ + import matplotlib.pyplot as plt + + data = np.asarray(data) + n_all = np.arange(2, len(data) + 1) + dd_all = nolds.hurst_rs(data, nvals=n_all, debug_data=True, fit="poly") + dd_def = nolds.hurst_rs(data, debug_data=True, fit="poly") + n_def = np.round(np.exp(dd_def[1][0])).astype("int32") + n_div = n_all[np.where(len(data) % n_all[:-1] == 0)] + dd_div = nolds.hurst_rs(data, nvals=n_div, debug_data=True, fit="poly") + + def corr(nvals: IntArray1D) -> list[np.float64]: + """Calculacte correction offset using expected_rs.""" + return [np.log(nolds.expected_rs(n)) for n in nvals] + + l_all = plt.plot(dd_all[1][0], dd_all[1][1] - corr(n_all), "o") + l_def = plt.plot(dd_def[1][0], dd_def[1][1] - corr(n_def), "o") + l_div = plt.plot(dd_div[1][0], dd_div[1][1] - corr(n_div), "o") + l_cst = [] + t_cst = [] + + if nvals is not None: + dd_cst = nolds.hurst_rs(data, nvals=nvals, debug_data=True, fit="poly") + l_cst = plt.plot(dd_cst[1][0], dd_cst[1][1] - corr(nvals), "o") + t_cst = ["custom"] + plt.xlabel("log(n)") + plt.ylabel("log((R/S)_n - E[(R/S)_n])") + plt.legend( + l_all + l_def + l_div + l_cst, + ["all", "default", "divisors", *t_cst], + ) + plt.show() + + +def aste_line_fitting(N: int = 100) -> None: + """Proves equivalence of T. 
Aste's original MATLAB code and `np.polyfit`.""" + rng = np.random.default_rng(528714688) + slope = rng.random() * 10 - 5 + intercept = rng.random() * 100 - 50 + xvals = np.arange(N) + yvals = xvals * slope + intercept + rng.standard_normal(N) * 100 + import matplotlib.pyplot as plt + + plt.plot(xvals, yvals, "rx", label="data") + plt.plot( + [0, N - 1], + [intercept, intercept + slope * (N - 1)], + "r-", + label=f"true ({slope:.3f} x + {intercept:.3f})", + alpha=0.5, + ) + i_aste, s_aste = nolds._aste_line_fit(xvals, yvals) # noqa: SLF001 + s_np, i_np = np.polyfit(xvals, yvals, 1) + plt.plot( + [0, N - 1], + [i_aste, i_aste + s_aste * (N - 1)], + "b-", + label=f"aste ({s_aste:.3f} x + {i_aste:.3f})", + alpha=0.5, + ) + plt.plot( + [0, N - 1], + [i_np, i_np + s_np * (N - 1)], + "g-", + label=f"numpy ({s_np:.3f} x + {i_np:.3f})", + alpha=0.5, + ) + plt.legend() + plt.show() + + +def hurst_mf_stock(*, debug: bool = False) -> None: + """Recreates results from [mfs_1]_ (table at start of section 4) as print output. + + Unfortunately as a layman in finance, I could not determine the exact data + that Di Matteo et al. used. Instead I use the data from + `nolds.datasets.load_financial()`. + + Plots H(2) for the following datasets and algorithms. + + Datasets (opening values from `load_financial()`): + + - jkse: Jakarta Composite Index + - n225: Nikkei 225 + - ndx: NASDAQ 100 + + Algorithms: + + - mfhurst_b: GHE according to Barabási et al. + - mfhurst_b + dt: like mfhurst_b, but with linear detrending performed first + - mfhurst_dm: GHE according to Di Matteo et al. (should be identical to + _genhurst) + - _genhurst: GHE according to translated MATLAB code by T. Aste (one of the + co-authors of Di Matteo). + + References: + + .. [mfs_1] T. Di Matteo, T. Aste, and M. M. Dacorogna, “Scaling behaviors + in differently developed markets,” Physica A: Statistical Mechanics + and its Applications, vol. 324, no. 1–2, pp. 183–188, 2003. 
+ + Args: + debug: if `True`, a debug plot will be shown for each calculated GHE value + except for the ones generated by `_genhurst`. + """ + financial = [ + (datasets.jkse, "jkse"), + (datasets.n225, "n225"), + (datasets.ndx, "ndx"), ] - elif maptype == "tent": - param_name = "$\\mu$" - param_range = np.arange(0, 2, 0.01) - full_data = np.array([ - np.fromiter(datasets.tent_map(x_start, n, mu), dtype="float32") - for mu in param_range - ]) - # for the tent map the lyapunov exponent is much easier to calculate - # since the values are multiplied by mu in each step, two trajectories - # starting in x and x + delta will have a distance of delta * mu^n after n - # steps. Therefore the lyapunov exponent should be log(mu). - lambdas = np.log(param_range, where=param_range > 0) - lambdas[np.where(param_range <= 0)] = np.nan - else: - raise Error("maptype %s not recognized" % maptype) - - kwargs_e = {"emb_dim": 6, "matrix_dim": 2} - kwargs_r = {"emb_dim": 6, "lag": 2, "min_tsep": 20, "trajectory_len": 20} - lambdas_e = [max(nolds.lyap_e(d, **kwargs_e)) for d in full_data] - lambdas_r = [nolds.lyap_r(d, **kwargs_r) for d in full_data] - bifur_x = np.repeat(param_range, nbifur) - bifur = np.reshape(full_data[:, -nbifur:], nbifur * param_range.shape[0]) - - plt.title("Lyapunov exponent of the %s map" % maptype) - plt.plot(param_range, lambdas, "b-", label="true lyap. exponent") - elab = "estimation using lyap_e" - rlab = "estimation using lyap_r" - plt.plot(param_range, lambdas_e, color="#00AAAA", label=elab) - plt.plot(param_range, lambdas_r, color="#AA00AA", label=rlab) - plt.plot(param_range, np.zeros(len(param_range)), "g--") - plt.plot(bifur_x, bifur, "ro", alpha=0.1, label="bifurcation plot") - plt.ylim((-2, 2)) - plt.xlabel(param_name) - plt.ylabel("lyap. 
exp / %s(x, %s)" % (maptype, param_name)) - plt.legend(loc="best") - plt.show() - - -def profiling(): - """ - Runs a profiling test for the function ``lyap_e`` (mainly used for - development) - - This function requires the package ``cProfile``. - """ - import cProfile - n = 10000 - data = np.cumsum(np.random.random(n) - 0.5) - cProfile.runctx('lyap_e(data)', {'lyap_e': nolds.lyap_e}, {'data': data}) - - -def hurst_compare_nvals(data, nvals=None): - """ - Creates a plot that compares the results of different choices for nvals - for the function hurst_rs. - - Args: - data (array-like of float): - the input data from which the hurst exponent should be estimated - - Kwargs: - nvals (array of int): - a manually selected value for the nvals parameter that should be plotted - in comparison to the default choices - """ - import matplotlib.pyplot as plt - data = np.asarray(data) - n_all = np.arange(2, len(data)+1) - dd_all = nolds.hurst_rs(data, nvals=n_all, debug_data=True, fit="poly") - dd_def = nolds.hurst_rs(data, debug_data=True, fit="poly") - n_def = np.round(np.exp(dd_def[1][0])).astype("int32") - n_div = n_all[np.where(len(data) % n_all[:-1] == 0)] - dd_div = nolds.hurst_rs(data, nvals=n_div, debug_data=True, fit="poly") - - def corr(nvals): - return [np.log(nolds.expected_rs(n)) for n in nvals] - - l_all = plt.plot(dd_all[1][0], dd_all[1][1] - corr(n_all), "o") - l_def = plt.plot(dd_def[1][0], dd_def[1][1] - corr(n_def), "o") - l_div = plt.plot(dd_div[1][0], dd_div[1][1] - corr(n_div), "o") - l_cst = [] - t_cst = [] - - if nvals is not None: - dd_cst = nolds.hurst_rs(data, nvals=nvals, debug_data=True, fit="poly") - l_cst = plt.plot(dd_cst[1][0], dd_cst[1][1] - corr(nvals), "o") - t_cst = ["custom"] - plt.xlabel("log(n)") - plt.ylabel("log((R/S)_n - E[(R/S)_n])") - plt.legend( - l_all + l_def + l_div + l_cst, ["all", "default", "divisors"] + t_cst - ) - labeled_data = zip([dd_all[0], dd_def[0], dd_div[0]], ["all", "def", "div"]) - for data, label in labeled_data: - 
print("%s: %.3f" % (label, data)) - if nvals is not None: - print("custom: %.3f" % dd_cst[0]) - plt.show() - -def sampen_default_tolerance(): - data = list(datasets.logistic_map(0.34, 1000, r=3.9)) - oldtol = 0.2 * np.std(data, ddof=1) - old_res = [ - nolds.sampen(data, emb_dim=i, tolerance=oldtol) - for i in range(1, 30) - ] - new_res = [ - nolds.sampen(data, emb_dim=i) - for i in range(1, 30) - ] - for i, old, new in zip(range(1, 30), old_res, new_res): - print("emb_dim={} old={:.3f} corrected={:.3f}".format(i, old, new)) - print(" old variance: {:.3f}".format(np.var(old_res))) - print("corrected variance: {:.3f}".format(np.var(new_res))) - -def aste_line_fitting(N=100): - """ - Shows plot that proves that the line fitting in T. Astes original MATLAB code - provides the same results as `np.polyfit`. - """ - slope = np.random.random() * 10 - 5 - intercept = np.random.random() * 100 - 50 - xvals = np.arange(N) - yvals = xvals * slope + intercept + np.random.randn(N)*100 - import matplotlib.pyplot as plt - plt.plot(xvals, yvals, "rx", label="data") - plt.plot( - [0, N-1], [intercept, intercept + slope * (N-1)], - "r-", label="true ({:.3f} x + {:.3f})".format(slope, intercept), alpha=0.5 - ) - i_aste, s_aste = nolds._aste_line_fit(xvals, yvals) - s_np, i_np = np.polyfit(xvals, yvals, 1) - plt.plot( - [0, N-1], [i_aste, i_aste + s_aste * (N-1)], - "b-", label="aste ({:.3f} x + {:.3f})".format(s_aste, i_aste), alpha=0.5 - ) - plt.plot( - [0, N-1], [i_np, i_np + s_np * (N-1)], - "g-", label="numpy ({:.3f} x + {:.3f})".format(s_np, i_np), alpha=0.5 - ) - plt.legend() - plt.show() - - -def hurst_mf_stock(debug=False): - """ - Recreates results from [mfs_1]_ (table at start of section 4) as print - output. - - Unfortunately as a layman in finance, I could not determine the exact data - that Di Matteo et al. used. Instead I use the data from - `nolds.datasets.load_financial()`. - - Plots H(2) for the following datasets and algorithms. 
- - Datasets (opening values from `load_financial()`): - - - jkse: Jakarta Composite Index - - n225: Nikkei 225 - - ndx: NASDAQ 100 - - Algorithms: - - - mfhurst_b: GHE according to Barabási et al. - - mfhurst_b + dt: like mfhurst_b, but with linear detrending performed first - - mfhurst_dm: GHE according to Di Matteo et al. (should be identical to - _genhurst) - - _genhurst: GHE according to translated MATLAB code by T. Aste (one of the - co-authors of Di Matteo). - - References: - - .. [mfs_1] T. Di Matteo, T. Aste, and M. M. Dacorogna, “Scaling behaviors - in differently developed markets,” Physica A: Statistical Mechanics - and its Applications, vol. 324, no. 1–2, pp. 183–188, 2003. - - Kwargs: - debug (boolean): - if `True`, a debug plot will be shown for each calculated GHE value - except for the ones generated by `_genhurst`. - """ - print("Dataset mfhurst_b mfhurst_b + dt mfhurst_dm _genhurst") - financial = [ - (datasets.jkse, "jkse"), (datasets.n225, "n225"), (datasets.ndx, "ndx") - ] - for data, lab in financial: - data = data[1][:, 0] - data = np.log(data) - dists = range(1, 20) - mfh_b = nolds.mfhurst_b(data, qvals=[2], dists=dists, debug_plot=debug)[0] - mfh_b_dt = nolds.mfhurst_b( - nolds.detrend_data(data, order=1), - qvals=[2], dists=dists, debug_plot=debug - )[0] - mfh_dm = nolds.mfhurst_dm(data, qvals=[2], debug_plot=debug)[0][0] - gh = nolds._genhurst(data, 2) - print("{:10s} {:5.3f} {:5.3f} {:5.3f} {:5.3f}".format(lab, mfh_b, mfh_b_dt, mfh_dm, gh)) - - -def barabasi_1991_figure2(): - """ - Recreates figure 2 from [bf2]_. - - This figure compares calculated and estimated values for H(q) for - a fractal generated by 9 iterations of the `barabasi1991_fractal` function - with b1 = 0.8 and b2 = 0.5. - - References: - .. [bf2] A.-L. Barabási and T. Vicsek, “Multifractality of self-affine - fractals,” Physical Review A, vol. 44, no. 4, pp. 2730–2733, 1991. 
- """ - import matplotlib.pyplot as plt - b1991 = datasets.barabasi1991_fractal(10000000, 9) - qvals = range(1, 11) - qvals_t = range(-10, 11) - b1 = 0.8 - b2 = 0.5 - dists = [4 ** i for i in range(6, 11)] - # dists = nolds.logarithmic_n(100, 0.01 * len(b1991), 2) - Hq = nolds.mfhurst_b(b1991, qvals=qvals, dists=dists) - Hq_t = [np.log((b1 ** q + b2 ** q) / 2) / np.log(0.25) / q for q in qvals_t] - plt.plot(qvals, Hq, "r+", label="mfhurst_b") - plt.plot(qvals_t, Hq_t, label="calculated value") - plt.legend(loc="best") - plt.xlabel("q") - plt.ylabel("H(q)") - plt.show() - - -def barabasi_1991_figure3(): - """ - Recreates figure 3 from [bf3]_. - - This figure compares calculated and estimated values for H(q) for a simple - Brownian motion that moves in unit steps (-1 or +1) in each time step. - - References: - .. [bf3] A.-L. Barabási and T. Vicsek, “Multifractality of self-affine - fractals,” Physical Review A, vol. 44, no. 4, pp. 2730–2733, 1991. - """ - import matplotlib.pyplot as plt - brown = np.cumsum(np.random.randint(0, 2, size=10000000)*2-1) - qvals = [-5, -4, -3, -2, -1.1, 0.1, 1, 2, 3, 4, 5] - Hq_t = [0.5 if q > -1 else -0.5/q for q in qvals] - dists = [2 ** i for i in range(6, 15)] - # dists = nolds.logarithmic_n(100, 0.01 * len(brown), 1.5) - Hq = nolds.mfhurst_b(brown, qvals=qvals, dists=dists, debug_plot=False) - plt.plot(qvals, Hq, "r+", label="mfhurst_b") - plt.plot(qvals, Hq_t, label="calculated value") - plt.ylim(0, 1) - plt.legend(loc="best") - plt.xlabel("q") - plt.ylabel("H(q)") - plt.show() - - -def lorenz(): - """ - Calculates different measures for the Lorenz system of ordinary - differential equations and compares nolds results with prescribed - results from the literature. 
- - The Lorenz system is a three dimensional dynamical system given - by the following equations: - - dx/dt = sigma * (y - x) - dy/dt = rho * x - y - x * z - dz/dt = x * y - beta * z - - To test the reconstruction of higher-dimensional phenomena from - one-dimensional data, the lorenz system is simulated with a - simple Euler method and then the x-, y-, and z-values are used - as one-dimensional input for the nolds algorithms. - - Parameters for Lorenz system: - - - sigma = 10 - - rho = 28 - - beta = 8/3 - - dt = 0.012 - - Algorithms: - - - ``lyap_r`` with min_tsep=1000, emb_dim=5, tau=0.01, and lag=5 (see [l_4]_) - - ``lyap_e`` with min_tsep=1000, emb_dim=5, matrix_dim=5, and tau=0.01 (see [l_4]_) - - ``corr_dim`` with emb_dim=10, and fit=poly (see [l_1]_) - - ``hurst_rs`` with fit=poly (see [l_3]_) - - ``dfa`` with default parameters (see [l_5]_) - - ``sampen`` with default parameters (see [l_2]_) - - References: - - .. [l_1] P. Grassberger and I. Procaccia, “Measuring the strangeness - of strange attractors,” Physica D: Nonlinear Phenomena, vol. 9, - no. 1, pp. 189–208, 1983. - .. [l_2] F. Kaffashi, R. Foglyano, C. G. Wilson, and K. A. Loparo, - “The effect of time delay on Approximate & Sample Entropy - calculations,” Physica D: Nonlinear Phenomena, vol. 237, no. 23, - pp. 3069–3074, 2008, doi: 10.1016/j.physd.2008.06.005. - .. [l_3] V. Suyal, A. Prasad, and H. P. Singh, “Nonlinear Time Series - Analysis of Sunspot Data,” Sol Phys, vol. 260, no. 2, pp. 441–449, - 2009, doi: 10.1007/s11207-009-9467-x. - .. [l_4] G. A. Leonov and N. V. Kuznetsov, “On differences and - similarities in the analysis of Lorenz, Chen, and Lu systems,” - Applied Mathematics and Computation, vol. 256, pp. 334–343, 2015, - doi: 10.1016/j.amc.2014.12.132. - .. [l_5] S. Wallot, J. P. Irmer, M. Tschense, N. Kuznetsov, A. Højlund, - and M. 
Dietz, “A Multivariate Method for Dynamic System Analysis: - Multivariate Detrended Fluctuation Analysis Using Generalized Variance,” - Topics in Cognitive Science, p. tops.12688, Sep. 2023, - doi: 10.1111/tops.12688. - - - """ - import matplotlib.pyplot as plt - sigma = 10 - rho = 28 - beta = 8.0/3 - start = [0, 22, 10] - n = 10000 - skip = 10000 - dt = 0.012 - data = datasets.lorenz_euler(n + skip, sigma, rho, beta, start=start, dt=dt)[skip:] - - # fig = plt.figure() - # ax = fig.add_subplot(111, projection="3d") - # ax.plot(data[:, 0], data[:, 1], data[:, 2]) - # plt.show() - # plt.close(fig) - - lyap_expected = datasets.lorenz_lyap(sigma, rho, beta) - # Rationale for argument values: - # start with medium settings for min_tsep and lag, span a large area with trajectory_len, set fit_offset to 0 - # up the embedding dimension until you get a clear line in the debug plot - # adjust trajectory_len and fit_offset to split off only the linear part - # in general: the longer the linear part of the plot, the better - lyap_r_args = dict(min_tsep=10, emb_dim=5, tau=dt, lag=5, trajectory_len=28, fit_offset=8, fit="poly") - lyap_rx = nolds.lyap_r(data[:, 0], **lyap_r_args) - lyap_ry = nolds.lyap_r(data[:, 1], **lyap_r_args) - lyap_rz = nolds.lyap_r(data[:, 2], **lyap_r_args) - # Rationale for argument values: - # Start with emb_dim=matrix_dim, medium min_tsep and min_nb - # After that, no good guidelines for stability. :( - # -> Just experiment with settings until you get close to expected value. ¯\_(ツ)_/¯ - # NOTE: It seems from this example and `lyapunov-logistic` that lyap_e has a scaling problem. 
- lyap_e_args = dict(min_tsep=10, emb_dim=5, matrix_dim=5, tau=dt, min_nb=8) - lyap_ex = nolds.lyap_e(data[:, 0], **lyap_e_args) - lyap_ey = nolds.lyap_e(data[:, 1], **lyap_e_args) - lyap_ez = nolds.lyap_e(data[:, 2], **lyap_e_args) - print("Expected Lyapunov exponent: ", lyap_expected) - print("lyap_r(x) : ", lyap_rx) - print("lyap_r(y) : ", lyap_ry) - print("lyap_r(z) : ", lyap_rz) - print("lyap_e(x) : ", lyap_ex) - print("lyap_e(y) : ", lyap_ey) - print("lyap_e(z) : ", lyap_ez) - print() - - # Rationale for argument values: - # Start with moderate settings for lag and a large span of rvals. - # Increase emb_dim until you get a clear line in the debug plot - # Clip rvals to select only the linear part of the plot. - # Increase lag as long as it increases the output. Stop when the output becomes smaller - # (or when you feel that the lag is unreasonably large.) - rvals = nolds.logarithmic_r(1, np.e, 1.1) # determined experimentally - corr_dim_args = dict(emb_dim=5, lag=10, fit="poly", rvals=rvals) - cdx = nolds.corr_dim(data[:, 0], **corr_dim_args) - cdy = nolds.corr_dim(data[:, 1], **corr_dim_args) - cdz = nolds.corr_dim(data[:, 2], **corr_dim_args) - # reference Grassberger-Procaccia 1983 - print("Expected correlation dimension: 2.05") - print("corr_dim(x) : ", cdx) - print("corr_dim(y) : ", cdy) - print("corr_dim(z) : ", cdz) - print() - - # Rationale for argument values: - # Start with a large range of nvals. - # Reduce those down cutting of the first few data points and then only keep the - # linear-ish looking part of the initial rise. 
- hurst_rs_args = dict(fit="poly", nvals=nolds.logarithmic_n(10, 70, 1.1)) - hx = nolds.hurst_rs(data[:, 0], **hurst_rs_args) - hy = nolds.hurst_rs(data[:, 1], **hurst_rs_args) - hz = nolds.hurst_rs(data[:, 2], **hurst_rs_args) - # reference: Suyal 2009 - print("Expected hurst exponent: 0.64 < H < 0.93") - print("hurst_rs(x) : ", hx) - print("hurst_rs(y) : ", hy) - print("hurst_rs(z) : ", hz) - print() - - # reference: Wallot 2023, Table 1 - # Rationale for argument values: Just follow paper - # NOTE since DFA is quite fast and Wallot 2023 use different initial values - # (x = y = z = 0.1 + e) and size of data (100k data points, 1000 runs) and - # don't report step size, we use different data here - data_dfa = datasets.lorenz_euler(120000, 10, 28, 8/3.0, start=[0.1,0.1,0.1], dt=0.002)[20000:] - nvals = nolds.logarithmic_n(200, len(data_dfa)/8, 2**0.2) - dfa_args = dict(nvals=nvals, order=2, overlap=False, fit_exp="poly") - dx = nolds.dfa(data_dfa[:, 0], **dfa_args) - dy = nolds.dfa(data_dfa[:, 1], **dfa_args) - dz = nolds.dfa(data_dfa[:, 2], **dfa_args) - print("Expected hurst parameter: [1.008 ±0.016, 0.926 ±0.016, 0.650 ±0.22]") - print("dfa(x) : ", dx) - print("dfa(y) : ", dy) - print("dfa(z) : ", dz) - print() - - # reference: Kaffashi 2008 - # Rationale for argument values: Just follow paper. 
- sampen_args = dict(emb_dim=2, lag=1) - sx = nolds.sampen(data[:, 0], **sampen_args) - sy = nolds.sampen(data[:, 1], **sampen_args) - sz = nolds.sampen(data[:, 2], **sampen_args) - print("Expected sample entropy: [0.15, 0.15, 0.25]") - print("sampen(x): ", sx) - print("sampen(y): ", sy) - print("sampen(z): ", sz) + for data, _lab in financial: + timeseries = data[1][:, 0] + timeseries = np.log(timeseries) + dists = range(1, 20) + nolds.mfhurst_b(timeseries, qvals=[2], dists=dists, debug_plot=debug)[0] + nolds.mfhurst_b( + nolds.detrend_data(timeseries, order=1), + qvals=[2], + dists=dists, + debug_plot=debug, + )[0] + nolds.mfhurst_dm(timeseries, qvals=[2], debug_plot=debug)[0][0] + nolds._genhurst(timeseries, 2) # noqa: SLF001 + + +def barabasi_1991_figure2() -> None: + """Recreates figure 2 from [bf2]_. + + This figure compares calculated and estimated values for H(q) for + a fractal generated by 9 iterations of the `barabasi1991_fractal` function + with b1 = 0.8 and b2 = 0.5. + + References: + .. [bf2] A.-L. Barabási and T. Vicsek, “Multifractality of self-affine + fractals,” Physical Review A, vol. 44, no. 4, pp. 2730–2733, 1991. + """ + import matplotlib.pyplot as plt + + b1991 = datasets.barabasi1991_fractal(10000000, 9) + qvals = range(1, 11) + qvals_t = range(-10, 11) + b1 = 0.8 + b2 = 0.5 + dists = [4**i for i in range(6, 11)] + Hq = nolds.mfhurst_b(b1991, qvals=qvals, dists=dists) + Hq_t = [np.log((b1**q + b2**q) / 2) / np.log(0.25) / q for q in qvals_t] + plt.plot(qvals, Hq, "r+", label="mfhurst_b") + plt.plot(qvals_t, Hq_t, label="calculated value") + plt.legend(loc="best") + plt.xlabel("q") + plt.ylabel("H(q)") + plt.show() + + +def barabasi_1991_figure3() -> None: + """Recreates figure 3 from [bf3]_. + + This figure compares calculated and estimated values for H(q) for a simple + Brownian motion that moves in unit steps (-1 or +1) in each time step. + + References: + .. [bf3] A.-L. Barabási and T. 
Vicsek, “Multifractality of self-affine + fractals,” Physical Review A, vol. 44, no. 4, pp. 2730–2733, 1991. + """ + import matplotlib.pyplot as plt + + rng = np.random.default_rng(2562651293) + brown = np.cumsum(rng.integers(0, 2, size=10000000) * 2 - 1) + qvals = [-5, -4, -3, -2, -1.1, 0.1, 1, 2, 3, 4, 5] + Hq_t = [0.5 if q > -1 else -0.5 / q for q in qvals] + dists = [2**i for i in range(6, 15)] + Hq = nolds.mfhurst_b(brown, qvals=qvals, dists=dists, debug_plot=False) + plt.plot(qvals, Hq, "r+", label="mfhurst_b") + plt.plot(qvals, Hq_t, label="calculated value") + plt.ylim(0, 1) + plt.legend(loc="best") + plt.xlabel("q") + plt.ylabel("H(q)") + plt.show() + + +def lorenz() -> None: + """Compares nolds results with prescribed results for Lorenz system. + + Calculates different measures for the Lorenz system of ordinary + differential equations and compares nolds results with prescribed + results from the literature. + + The Lorenz system is a three dimensional dynamical system given + by the following equations: + + dx/dt = sigma * (y - x) + dy/dt = rho * x - y - x * z + dz/dt = x * y - beta * z + + To test the reconstruction of higher-dimensional phenomena from + one-dimensional data, the lorenz system is simulated with a + simple Euler method and then the x-, y-, and z-values are used + as one-dimensional input for the nolds algorithms. + + Parameters for Lorenz system: + + - sigma = 10 + - rho = 28 + - beta = 8/3 + - dt = 0.012 + + Algorithms: + + - ``lyap_r`` with min_tsep=1000, emb_dim=5, tau=0.01, and lag=5 (see [l_4]_) + - ``lyap_e`` with min_tsep=1000, emb_dim=5, matrix_dim=5, and tau=0.01 (see [l_4]_) + - ``corr_dim`` with emb_dim=10, and fit=poly (see [l_1]_) + - ``hurst_rs`` with fit=poly (see [l_3]_) + - ``dfa`` with default parameters (see [l_5]_) + - ``sampen`` with default parameters (see [l_2]_) + + References: + + .. [l_1] P. Grassberger and I. Procaccia, “Measuring the strangeness + of strange attractors,” Physica D: Nonlinear Phenomena, vol. 
9, + no. 1, pp. 189–208, 1983. + .. [l_2] F. Kaffashi, R. Foglyano, C. G. Wilson, and K. A. Loparo, + “The effect of time delay on Approximate & Sample Entropy + calculations,” Physica D: Nonlinear Phenomena, vol. 237, no. 23, + pp. 3069–3074, 2008, doi: 10.1016/j.physd.2008.06.005. + .. [l_3] V. Suyal, A. Prasad, and H. P. Singh, “Nonlinear Time Series + Analysis of Sunspot Data,” Sol Phys, vol. 260, no. 2, pp. 441–449, + 2009, doi: 10.1007/s11207-009-9467-x. + .. [l_4] G. A. Leonov and N. V. Kuznetsov, “On differences and + similarities in the analysis of Lorenz, Chen, and Lu systems,” + Applied Mathematics and Computation, vol. 256, pp. 334–343, 2015, + doi: 10.1016/j.amc.2014.12.132. + .. [l_5] S. Wallot, J. P. Irmer, M. Tschense, N. Kuznetsov, A. Højlund, + and M. Dietz, “A Multivariate Method for Dynamic System Analysis: + Multivariate Detrended Fluctuation Analysis Using Generalized Variance,” + Topics in Cognitive Science, p. tops.12688, Sep. 2023, + doi: 10.1111/tops.12688. + """ + sigma = 10 + rho = 28 + beta = 8.0 / 3 + start = [0.0, 22.0, 10.0] + n = 10000 + skip = 10000 + dt = 0.012 + data = datasets.lorenz_euler(n + skip, sigma, rho, beta, start=start, dt=dt)[skip:] + + datasets.lorenz_lyap(sigma, rho, beta) + # Rationale for argument values: + # start with medium settings for min_tsep and lag, span a large area with trajectory_len, + # set fit_offset to 0 + # up the embedding dimension until you get a clear line in the debug plot + # adjust trajectory_len and fit_offset to split off only the linear part + # in general: the longer the linear part of the plot, the better + lyap_r_args = { + "min_tsep": 10, + "emb_dim": 5, + "tau": dt, + "lag": 5, + "trajectory_len": 28, + "fit_offset": 8, + "fit": "poly", + } + nolds.lyap_r(data[:, 0], **lyap_r_args) + nolds.lyap_r(data[:, 1], **lyap_r_args) + nolds.lyap_r(data[:, 2], **lyap_r_args) + # Rationale for argument values: + # Start with emb_dim=matrix_dim, medium min_tsep and min_nb + # After that, no good 
guidelines for stability. :( + # -> Just experiment with settings until you get close to expected value. ¯\_(ツ)_/¯ + # NOTE: It seems from this example and `lyapunov-logistic` that lyap_e has a scaling problem. + lyap_e_args = {"min_tsep": 10, "emb_dim": 5, "matrix_dim": 5, "tau": dt, "min_nb": 8} + nolds.lyap_e(data[:, 0], **lyap_e_args) + nolds.lyap_e(data[:, 1], **lyap_e_args) + nolds.lyap_e(data[:, 2], **lyap_e_args) + + # Rationale for argument values: + # Start with moderate settings for lag and a large span of rvals. + # Increase emb_dim until you get a clear line in the debug plot + # Clip rvals to select only the linear part of the plot. + # Increase lag as long as it increases the output. Stop when the output becomes smaller + # (or when you feel that the lag is unreasonably large.) + rvals = nolds.logarithmic_r(1, np.e, 1.1) # determined experimentally + corr_dim_args = {"emb_dim": 5, "lag": 10, "fit": "poly", "rvals": rvals} + nolds.corr_dim(data[:, 0], **corr_dim_args) + nolds.corr_dim(data[:, 1], **corr_dim_args) + nolds.corr_dim(data[:, 2], **corr_dim_args) + # reference Grassberger-Procaccia 1983 + + # Rationale for argument values: + # Start with a large range of nvals. + # Reduce those down cutting off the first few data points and then only keep the + # linear-ish looking part of the initial rise. 
+ hurst_rs_args = {"fit": "poly", "nvals": nolds.logarithmic_n(10, 70, 1.1)} + nolds.hurst_rs(data[:, 0], **hurst_rs_args) + nolds.hurst_rs(data[:, 1], **hurst_rs_args) + nolds.hurst_rs(data[:, 2], **hurst_rs_args) + # reference: Suyal 2009 + + # reference: Wallot 2023, Table 1 + # Rationale for argument values: Just follow paper + # NOTE since DFA is quite fast and Wallot 2023 use different initial values + # (x = y = z = 0.1 + e) and size of data (100k data points, 1000 runs) and + # don't report step size, we use different data here + data_dfa = datasets.lorenz_euler(120000, 10, 28, 8 / 3.0, start=[0.1, 0.1, 0.1], dt=0.002)[ + 20000: + ] + nvals = nolds.logarithmic_n(200, np.ceil(len(data_dfa) / 8), 2**0.2) + dfa_args = {"nvals": nvals, "order": 2, "overlap": False, "fit_exp": "poly"} + nolds.dfa(data_dfa[:, 0], **dfa_args) + nolds.dfa(data_dfa[:, 1], **dfa_args) + nolds.dfa(data_dfa[:, 2], **dfa_args) + + # reference: Kaffashi 2008 + # Rationale for argument values: Just follow paper. 
+ sampen_args = {"emb_dim": 2, "lag": 1} + nolds.sampen(data[:, 0], **sampen_args) # pyright: ignore reportCallIssue + nolds.sampen(data[:, 1], **sampen_args) # pyright: ignore reportCallIssue + nolds.sampen(data[:, 2], **sampen_args) # pyright: ignore reportCallIssue if __name__ == "__main__": - # run this with the following command: - # python -m nolds.examples lyapunov-logistic - import sys - - def print_options(): - print("options are:") - print(" lyapunov-logistic") - print(" lyapunov-tent") - print(" profiling") - print(" hurst-weron2") - print(" hurst-hist") - print(" hurst-nvals") - print(" sampen-tol") - print(" aste-line") - print(" hurst-mf-stock") - print(" lorenz") - if len(sys.argv) < 2: - print("please tell me which tests you want to run") - print_options() - elif sys.argv[1] == "lyapunov-logistic": - plot_lyap() - elif sys.argv[1] == "lyapunov-tent": - plot_lyap("tent") - elif sys.argv[1] == "profiling": - profiling() - elif sys.argv[1] == "hurst-weron2": - n = 1000 if len(sys.argv) < 3 else int(sys.argv[2]) - weron_2002_figure2(n) - elif sys.argv[1] == "hurst-hist": - plot_hurst_hist() - elif sys.argv[1] == "hurst-nvals": - hurst_compare_nvals(datasets.brown72) - elif sys.argv[1] == "sampen-tol": - sampen_default_tolerance() - elif sys.argv[1] == "aste-line": - aste_line_fitting() - elif sys.argv[1] == "hurst-mf-stock": - hurst_mf_stock() - elif sys.argv[1] == "hurst-mf-barabasi2": - barabasi_1991_figure2() - elif sys.argv[1] == "hurst-mf-barabasi3": - barabasi_1991_figure3() - elif sys.argv[1] == "lorenz": - lorenz() - else: - print("i do not know any test of that name") - print_options() + # run this with the following command: + # python -m nolds.examples lyapunov-logistic + + parser = argparse.ArgumentParser( + prog="nolds.examples", description="Run examples for nolds metrics." 
+ ) + parser.add_argument( + "example", + choices=[ + "lyapunov-logistic", + "lyapunov-tent", + "profiling", + "hurst-weron2", + "hurst-hist", + "hurst-nvals", + "aste-line", + "hurst-mf-stock", + "hurst-mf-barabasi2", + "hurst-mf-barabasi3", + "lorenz", + ], + help="Which example to run.", + ) + parser.add_argument( + "--size", + "-n", + default=None, + type=int, + help="number of iterations or datapoints to use for certain examples", + ) + args = parser.parse_args() + + match args.example: + case "lyapunov-logistic": + plot_lyap() + case "lyapunov-tent": + plot_lyap("tent") + case "profiling": + profiling() + case "hurst-weron2": + weron_2002_figure2(1000 if args.size is None else args.size) + case "hurst-hist": + plot_hurst_hist() + case "hurst-nvals": + hurst_compare_nvals(datasets.brown72) + case "aste-line": + aste_line_fitting(100 if args.size is None else args.size) + case "hurst-mf-stock": + hurst_mf_stock() + case "hurst-mf-barabasi2": + barabasi_1991_figure2() + case "hurst-mf-barabasi3": + barabasi_1991_figure3() + case "lorenz": + lorenz() diff --git a/nolds/measures.py b/nolds/measures.py index 7fe4d1b..5c89e2e 100644 --- a/nolds/measures.py +++ b/nolds/measures.py @@ -1,1568 +1,1999 @@ -# -*- coding: utf-8 -*- -from __future__ import (absolute_import, division, - print_function, unicode_literals) -from builtins import ( - bytes, dict, int, list, object, range, str, ascii, chr, hex, input, next, - oct, open, pow, round, super, filter, map, zip +"""Main module containing all measures implemented in nolds.""" + +from __future__ import annotations + +import math +import warnings +from typing import ( + TYPE_CHECKING, + Literal, + TypeAlias, + TypeVar, + cast, + overload, ) + import numpy as np -import warnings -import math + +if TYPE_CHECKING: + from collections.abc import Callable + from pathlib import Path + + from numpy.typing import ArrayLike + + D = TypeVar("D", bound=np.integer | np.floating) + # Array type definitions + # NOTE: We define aliases 
here to save space and to make it easy to update + # the types when numpy settles on a best practice for annotating array dimensions. + IntArray1D: TypeAlias = np.ndarray[tuple[int], np.dtype[np.int32]] + FloatArray1D: TypeAlias = np.ndarray[tuple[int], np.dtype[np.float64]] + FloatArray2D: TypeAlias = np.ndarray[tuple[int, int], np.dtype[np.float64]] + NumberArray1D: TypeAlias = np.ndarray[tuple[int], np.dtype[D]] + NumberArray2D: TypeAlias = np.ndarray[tuple[int, int], np.dtype[D]] + # Define more specific aliases for input data + # NOTE: These don't change anything in type checking, but the type name serves as + # additional documentation for users. + IntArrayLike1D: TypeAlias = ArrayLike # 1D structure containing int values + FloatArrayLike1D: TypeAlias = ArrayLike # 1D structure containing float values + NumberArrayLike1D: TypeAlias = ArrayLike # 1D structure containing number values + +float_precision = np.float64 +"""Default floating point precision used by nolds. + +Within nolds, this is considered static. However, downstream code might want to +change the precision for a specific measure to save time and space. + +This use case is not common enough to warrant an entire API around it, but +by introducing this variable we at least enable it in principle. 
+""" + + +def rowwise_chebyshev(x: NumberArray2D, y: NumberArray1D) -> NumberArray1D: + """Returns the Chebyshev distances between each row of matrix x and the reference row y.""" + return np.max(np.abs(x - y), axis=1) -def rowwise_chebyshev(x, y): - return np.max(np.abs(x - y), axis=1) - - -def rowwise_euclidean(x, y): - return np.sqrt(np.sum((x - y)**2, axis=1)) - - -def poly_fit(x, y, degree, fit="RANSAC"): - # check if we can use RANSAC - if fit == "RANSAC": - try: - # ignore ImportWarnings in sklearn - with warnings.catch_warnings(): - warnings.simplefilter("ignore", ImportWarning) - import sklearn.linear_model as sklin - import sklearn.preprocessing as skpre - except ImportError: - warnings.warn( - "fitting mode 'RANSAC' requires the package sklearn, using" - + " 'poly' instead", - RuntimeWarning) - fit = "poly" - - if fit == "poly": - return np.polyfit(x, y, degree) - elif fit == "RANSAC": - model = sklin.RANSACRegressor(sklin.LinearRegression(fit_intercept=False)) - xdat = np.asarray(x) - if len(xdat.shape) == 1: - # interpret 1d-array as list of len(x) samples instead of - # one sample of length len(x) - xdat = xdat.reshape(-1, 1) - polydat = skpre.PolynomialFeatures(degree).fit_transform(xdat) - try: - model.fit(polydat, y) - coef = model.estimator_.coef_[::-1] - except ValueError: - warnings.warn( - "RANSAC did not reach consensus, " - + "using numpy's polyfit", - RuntimeWarning) - coef = np.polyfit(x, y, degree) - return coef - else: - raise ValueError("invalid fitting mode ({})".format(fit)) - - -def delay_embedding(data, emb_dim, lag=1): - """ - Perform a time-delay embedding of a time series - - Args: - data (array-like): - the data that should be embedded - emb_dim (int): - the embedding dimension - Kwargs: - lag (int): - the lag between elements in the embedded vectors - - Returns: - emb_dim x m array: - matrix of embedded vectors of the form - [data[i], data[i+lag], data[i+2*lag], ... 
data[i+(emb_dim-1)*lag]] - for i in 0 to m-1 (m = len(data)-(emb_dim-1)*lag) - """ - data = np.asarray(data) - min_len = (emb_dim - 1) * lag + 1 - if len(data) < min_len: - msg = "cannot embed data of length {} with embedding dimension {} " \ - + "and lag {}, minimum required length is {}" - raise ValueError(msg.format(len(data), emb_dim, lag, min_len)) - m = len(data) - min_len + 1 - indices = np.repeat([np.arange(emb_dim) * lag], m, axis=0) - indices += np.arange(m).reshape((m, 1)) - return data[indices] - - -def lyap_r_len(**kwargs): - """ - Helper function that calculates the minimum number of data points required - to use lyap_r. - - Note that none of the required parameters may be set to None. - - Kwargs: - kwargs(dict): - arguments used for lyap_r (required: emb_dim, lag, trajectory_len and - min_tsep) - - Returns: - minimum number of data points required to call lyap_r with the given - parameters - """ - # minimum length required to find single orbit vector - min_len = (kwargs['emb_dim'] - 1) * kwargs['lag'] + 1 - # we need trajectory_len orbit vectors to follow a complete trajectory - min_len += kwargs['trajectory_len'] - 1 - # we need min_tsep * 2 + 1 orbit vectors to find neighbors for each - min_len += kwargs['min_tsep'] * 2 + 1 - return min_len - - -def lyap_r(data, emb_dim=10, lag=None, min_tsep=None, tau=1, min_neighbors=20, - trajectory_len=20, fit="RANSAC", debug_plot=False, debug_data=False, - plot_file=None, fit_offset=0): - """ - Estimates the largest Lyapunov exponent using the algorithm of Rosenstein - et al. [lr_1]_. - - Explanation of Lyapunov exponents: - See lyap_e. - - Explanation of the algorithm: - The algorithm of Rosenstein et al. is only able to recover the largest - Lyapunov exponent, but behaves rather robust to parameter choices. - - The idea for the algorithm relates closely to the definition of Lyapunov - exponents. 
First, the dynamics of the data are reconstructed using a delay - embedding method with a lag, such that each value x_i of the data is mapped - to the vector - - X_i = [x_i, x_(i+lag), x_(i+2*lag), ..., x_(i+(emb_dim-1) * lag)] - - For each such vector X_i, we find the closest neighbor X_j using the - euclidean distance. We know that as we follow the trajectories from X_i and - X_j in time in a chaotic system the distances between X_(i+k) and X_(j+k) - denoted as d_i(k) will increase according to a power law - d_i(k) = c * e^(lambda * k) where lambda is a good approximation of the - highest Lyapunov exponent, because the exponential expansion along the axis - associated with this exponent will quickly dominate the expansion or - contraction along other axes. - - To calculate lambda, we look at the logarithm of the distance trajectory, - because log(d_i(k)) = log(c) + lambda * k. This gives a set of lines - (one for each index i) whose slope is an approximation of lambda. We - therefore extract the mean log trajectory d'(k) by taking the mean of - log(d_i(k)) over all orbit vectors X_i. We then fit a straight line to - the plot of d'(k) versus k. The slope of the line gives the desired - parameter lambda. - - Method for choosing min_tsep: - Usually we want to find neighbors between points that are close in phase - space but not too close in time, because we want to avoid spurious - correlations between the obtained trajectories that originate from temporal - dependencies rather than the dynamic properties of the system. Therefore it - is critical to find a good value for min_tsep. One rather plausible - estimate for this value is to set min_tsep to the mean period of the - signal, which can be obtained by calculating the mean frequency using the - fast fourier transform. This procedure is used by default if the user sets - min_tsep = None. 
Note that this default procedure uses a naive approach - for estimating the power spectral density, which just takes the FFT of the - whole signal without applying any windowing function to avoid biases. If - you have a non-stationary input and want more than a rough estimate, - consider calculating min_tsep manually using a sliding window approach - like Welch's method (implemented in `scipy.signal.welch`). - - Method for choosing lag: - Another parameter that can be hard to choose by instinct alone is the lag - between individual values in a vector of the embedded orbit. Here, - Rosenstein et al. suggest to set the lag to the distance where the - autocorrelation function drops below 1 - 1/e times its original (maximal) - value. This procedure is used by default if the user sets lag = None. - - References: - .. [lr_1] M. T. Rosenstein, J. J. Collins, and C. J. De Luca, - “A practical method for calculating largest Lyapunov exponents from - small data sets,” Physica D: Nonlinear Phenomena, vol. 65, no. 1, - pp. 117–134, 1993. - - Reference Code: - .. [lr_a] mirwais, "Largest Lyapunov Exponent with Rosenstein's Algorithm", - url: http://www.mathworks.com/matlabcentral/fileexchange/38424-largest-lyapunov-exponent-with-rosenstein-s-algorithm - .. [lr_b] Shapour Mohammadi, "LYAPROSEN: MATLAB function to calculate - Lyapunov exponent", - url: https://ideas.repec.org/c/boc/bocode/t741502.html - .. 
[lr_c] Rainer Hegger, Holger Kantz, and Thomas Schreiber, "TISEAN 3.0.0 - Nonlinear Time Series Analysis", - url: https://www.pks.mpg.de/tisean/Tisean_3.0.0/docs/docs_c/lyap_r.html - - Args: - data (iterable of float): - (one-dimensional) time series - Kwargs: - emb_dim (int): - embedding dimension for delay embedding - lag (float): - lag for delay embedding - min_tsep (float): - minimal temporal separation between two "neighbors" (default: - find a suitable value by calculating the mean period of the data) - tau (float): - step size between data points in the time series in seconds - (normalization scaling factor for exponents) - min_neighbors (int): - if lag=None, the search for a suitable lag will be stopped when the - number of potential neighbors for a vector drops below min_neighbors - trajectory_len (int): - the time (in number of data points) to follow the distance - trajectories between two neighboring points - fit (str): - the fitting method to use for the line fit, either 'poly' for normal - least squares polynomial fitting or 'RANSAC' for RANSAC-fitting which - is more robust to outliers - debug_plot (boolean): - if True, a simple plot of the final line-fitting step will - be shown - debug_data (boolean): - if True, debugging data will be returned alongside the result - plot_file (str): - if debug_plot is True and plot_file is not None, the plot will be saved - under the given file name instead of directly showing it through - ``plt.show()`` - fit_offset (int): - neglect the first fit_offset steps when fitting - - Returns: - float: - an estimate of the largest Lyapunov exponent (a positive exponent is - a strong indicator for chaos) - (1d-vector, 1d-vector, list): - only present if debug_data is True: debug data of the form - ``(ks, div_traj, poly)`` where ``ks`` are the x-values of the line fit, - ``div_traj`` are the y-values and ``poly`` are the line coefficients - (``[slope, intercept]``). 
- - """ - # convert data to float to avoid overflow errors in rowwise_euclidean - data = np.asarray(data, dtype=np.float64) - n = len(data) - max_tsep_factor = 0.25 - if lag is None or min_tsep is None: - # both the algorithm for lag and min_tsep need the fft - f = np.fft.rfft(data, n * 2 - 1) - if min_tsep is None: - # calculate min_tsep as mean period (= 1 / mean frequency) - # to get the mean frequency, we weight the frequency buckets in the - # fft result by the absolute power in that bucket and then divide - # by the total power across all buckets to get a weighted mean. - # This can be inaccurate for non-stationary inputs. A better approach would - # be to use scipy.signal.welch, but this requires making some other - # parameter choices like the size of the sliding window that require some - # knowledge about the input data, which we don't have at this point. - freqs = np.fft.rfftfreq(n * 2 - 1) - psd = np.abs(f)**2 - mf = np.sum(freqs[1:] * psd[1:]) / np.sum(psd[1:]) - min_tsep = int(np.ceil(1.0 / mf)) - if min_tsep > max_tsep_factor * n: - min_tsep = int(max_tsep_factor * n) - msg = "signal has very low mean frequency, setting min_tsep = {:d}" - warnings.warn(msg.format(min_tsep), RuntimeWarning) - if lag is None: - # calculate the lag as point where the autocorrelation drops to (1 - 1/e) - # times its maximum value - # note: the Wiener–Khinchin theorem states that the spectral - # decomposition of the autocorrelation function of a process is the power - # spectrum of that process - # => we can use fft to calculate the autocorrelation - acorr = np.fft.irfft(f * np.conj(f)) - acorr = np.roll(acorr, n - 1) - eps = acorr[n - 1] * (1 - 1.0 / np.e) - lag = 1 - - # small helper function to calculate resulting number of vectors for a - # given lag value - def nb_neighbors(lag_value): - min_len = lyap_r_len( - emb_dim=emb_dim, lag=lag_value, trajectory_len=trajectory_len, - min_tsep=min_tsep - ) - return max(0, n - min_len) - # find lag - for i in range(1, n): - 
lag = i - if acorr[n - 1 + i] < eps or acorr[n - 1 - i] < eps: - break - if nb_neighbors(i) < min_neighbors: - msg = "autocorrelation declined too slowly to find suitable lag" \ - + ", setting lag to {}" - warnings.warn(msg.format(lag), RuntimeWarning) - break - min_len = lyap_r_len( - emb_dim=emb_dim, lag=lag, trajectory_len=trajectory_len, - min_tsep=min_tsep - ) - if len(data) < min_len: - msg = "for emb_dim = {}, lag = {}, min_tsep = {} and trajectory_len = {}" \ - + " you need at least {} datapoints in your time series" - warnings.warn( - msg.format(emb_dim, lag, min_tsep, trajectory_len, min_len), - RuntimeWarning +def rowwise_euclidean(x: NumberArray2D, y: NumberArray1D) -> NumberArray1D: + """Returns the Euclidean distances between each row of matrix x and the reference row y.""" + return np.sqrt(np.sum((x - y) ** 2, axis=1)) + + +FittingMethod = Literal["RANSAC", "poly"] + + +def poly_fit( + x: NumberArray1D, + y: NumberArray1D, + degree: int, + fit: FittingMethod = "RANSAC", + random_state: int | None = None, +) -> FloatArray1D: + """Fits a polynomial of the given degree to the data. + + This currently supports two fitting algorithms. + + - "poly" uses the standard `np.polyfit` function to perform a least squares fit. + - "RANSAC" uses the RANSAC algorithm, which is more robust to outliers but + introduces inaccuracies due to randomness. + + If "RANSAC" is chosen, but scikit-learn is not installed, "poly" is used as + a fallback option.
+ + Args: + x: x-axis values + y: y-axis values + degree: degree of the polynomial + fit: algorithm to use for fitting + random_state: Seed for random number generator used for RANSAC + """ + # check if we can use RANSAC + if fit == "RANSAC": + try: + # ignore ImportWarnings in sklearn + with warnings.catch_warnings(): + warnings.simplefilter("ignore", ImportWarning) + import sklearn.linear_model as sklin + import sklearn.preprocessing as skpre + except ImportError: + warnings.warn( + "fitting mode 'RANSAC' requires the package sklearn, using 'poly' instead", + RuntimeWarning, + stacklevel=2, + ) + fit = "poly" + + if fit == "poly": + return np.polyfit(x, y, degree) + if fit == "RANSAC": + model = sklin.RANSACRegressor( + sklin.LinearRegression(fit_intercept=False), random_state=random_state + ) + xdat = np.asarray(x) + if len(xdat.shape) == 1: + # interpret 1d-array as list of len(x) samples instead of + # one sample of length len(x) + xdat = xdat.reshape(-1, 1) + polydat = skpre.PolynomialFeatures(degree).fit_transform(xdat) + try: + model.fit(polydat, y) + coef = cast("sklin.LinearRegression", model.estimator_).coef_[::-1] + except ValueError: + warnings.warn( + "RANSAC did not reach consensus, using numpy's polyfit", + RuntimeWarning, + stacklevel=2, + ) + coef = np.polyfit(x, y, degree) + return coef + msg = f"invalid fitting mode ({fit})" + raise ValueError(msg) + + +def delay_embedding(data: NumberArrayLike1D, emb_dim: int, lag: int = 1) -> FloatArray2D: + """Perform a time-delay embedding of a time series. + + Args: + data: the data that should be embedded + emb_dim: the embedding dimension + lag: the lag between elements in the embedded vectors + + Returns: + Matrix of shape (m, emb_dim) containing embedded vectors of the form + [data[i], data[i+lag], data[i+2*lag], ... 
data[i+(emb_dim-1)*lag]] + for i in 0 to m-1 (m = len(data)-(emb_dim-1)*lag) + """ + if not isinstance(data, np.ndarray): + data = np.asarray(data, dtype=float_precision) + min_len = (emb_dim - 1) * lag + 1 + if len(data) < min_len: + msg = ( + "cannot embed data of length {} with embedding dimension {} " + "and lag {}, minimum required length is {}" + ) + raise ValueError(msg.format(len(data), emb_dim, lag, min_len)) + m = len(data) - min_len + 1 + indices = np.repeat([np.arange(emb_dim) * lag], m, axis=0) + indices += np.arange(m).reshape((m, 1)) + return data[indices] + + +def lyap_r_len(emb_dim: int, lag: int, trajectory_len: int, min_tsep: int) -> int: + """Calculates the minimum number of data points required to use lyap_r. + + Note that none of the required parameters may be set to None. + + Args: + emb_dim: embedding dimension for delay embedding + lag: lag for delay embedding + min_tsep: minimal temporal separation (in number of data points) between two "neighbors" + trajectory_len: the time (in number of data points) to follow the distance + trajectories between two neighboring points + + + Returns: + minimum number of data points required to call lyap_r with the given + parameters + """ + # minimum length required to find single orbit vector + min_len = (emb_dim - 1) * lag + 1 + # we need trajectory_len orbit vectors to follow a complete trajectory + min_len += trajectory_len - 1 + # we need min_tsep * 2 + 1 orbit vectors to find neighbors for each + min_len += min_tsep * 2 + 1 + return min_len + + +@overload +def lyap_r( + data: NumberArrayLike1D, + emb_dim: int = 10, + *, + lag: int | None = None, + min_tsep: int | None = None, + tau: float = 1, + min_neighbors: int = 20, + trajectory_len: int = 20, + fit: FittingMethod = "RANSAC", + debug_plot: bool = False, + debug_data: Literal[False] = False, + plot_file: str | Path | None = None, + fit_offset: int = 0, + random_state: int | None = None, +) -> float_precision: ... 
+ + +@overload +def lyap_r( + data: NumberArrayLike1D, + emb_dim: int = 10, + *, + lag: int | None = None, + min_tsep: int | None = None, + tau: float = 1, + min_neighbors: int = 20, + trajectory_len: int = 20, + fit: FittingMethod = "RANSAC", + debug_plot: bool = False, + debug_data: Literal[True] = True, + plot_file: str | Path | None = None, + fit_offset: int = 0, + random_state: int | None = None, +) -> tuple[ + float_precision, + tuple[ + IntArray1D, + FloatArray1D, + FloatArray1D, + ], +]: ... + + +def lyap_r( # noqa: C901, PLR0912, PLR0915 + data: NumberArrayLike1D, + emb_dim: int = 10, + *, + lag: int | None = None, + min_tsep: int | None = None, + tau: float = 1, + min_neighbors: int = 20, + trajectory_len: int = 20, + fit: FittingMethod = "RANSAC", + debug_plot: bool = False, + debug_data: bool = False, + plot_file: str | Path | None = None, + fit_offset: int = 0, + random_state: int | None = None, +) -> ( + float_precision + | tuple[ + float, + tuple[ + IntArray1D, + FloatArray1D, + FloatArray1D, + ], + ] +): + """Estimates the largest Lyapunov exponent with the method of Rosenstein et al. [lr_1]_. + + Explanation of Lyapunov exponents: + See lyap_e. + + Explanation of the algorithm: + The algorithm of Rosenstein et al. is only able to recover the largest + Lyapunov exponent, but behaves rather robust to parameter choices. + + The idea for the algorithm relates closely to the definition of Lyapunov + exponents. First, the dynamics of the data are reconstructed using a delay + embedding method with a lag, such that each value x_i of the data is mapped + to the vector + + X_i = [x_i, x_(i+lag), x_(i+2*lag), ..., x_(i+(emb_dim-1) * lag)] + + For each such vector X_i, we find the closest neighbor X_j using the + euclidean distance. 
We know that as we follow the trajectories from X_i and + X_j in time in a chaotic system the distances between X_(i+k) and X_(j+k) + denoted as d_i(k) will increase according to a power law + d_i(k) = c * e^(lambda * k) where lambda is a good approximation of the + highest Lyapunov exponent, because the exponential expansion along the axis + associated with this exponent will quickly dominate the expansion or + contraction along other axes. + + To calculate lambda, we look at the logarithm of the distance trajectory, + because log(d_i(k)) = log(c) + lambda * k. This gives a set of lines + (one for each index i) whose slope is an approximation of lambda. We + therefore extract the mean log trajectory d'(k) by taking the mean of + log(d_i(k)) over all orbit vectors X_i. We then fit a straight line to + the plot of d'(k) versus k. The slope of the line gives the desired + parameter lambda. + + Method for choosing min_tsep: + Usually we want to find neighbors between points that are close in phase + space but not too close in time, because we want to avoid spurious + correlations between the obtained trajectories that originate from temporal + dependencies rather than the dynamic properties of the system. Therefore it + is critical to find a good value for min_tsep. One rather plausible + estimate for this value is to set min_tsep to the mean period of the + signal, which can be obtained by calculating the mean frequency using the + fast fourier transform. This procedure is used by default if the user sets + min_tsep = None. Note that this default procedure uses a naive approach + for estimating the power spectral density, which just takes the FFT of the + whole signal without applying any windowing function to avoid biases. If + you have a non-stationary input and want more than a rough estimate, + consider calculating min_tsep manually using a sliding window approach + like Welch's method (implemented in `scipy.signal.welch`). 
+ + Method for choosing lag: + Another parameter that can be hard to choose by instinct alone is the lag + between individual values in a vector of the embedded orbit. Here, + Rosenstein et al. suggest to set the lag to the distance where the + autocorrelation function drops below 1 - 1/e times its original (maximal) + value. This procedure is used by default if the user sets lag = None. + + References: + .. [lr_1] M. T. Rosenstein, J. J. Collins, and C. J. De Luca, + “A practical method for calculating largest Lyapunov exponents from + small data sets,” Physica D: Nonlinear Phenomena, vol. 65, no. 1, + pp. 117–134, 1993. + + Reference Code: + .. [lr_a] mirwais, "Largest Lyapunov Exponent with Rosenstein's Algorithm", + url: http://www.mathworks.com/matlabcentral/fileexchange/38424-largest-lyapunov-exponent-with-rosenstein-s-algorithm + .. [lr_b] Shapour Mohammadi, "LYAPROSEN: MATLAB function to calculate + Lyapunov exponent", + url: https://ideas.repec.org/c/boc/bocode/t741502.html + .. 
[lr_c] Rainer Hegger, Holger Kantz, and Thomas Schreiber, + "TISEAN 3.0.0 - Nonlinear Time Series Analysis", + url: https://www.pks.mpg.de/tisean/Tisean_3.0.0/docs/docs_c/lyap_r.html + + Args: + data: (one-dimensional) time series + emb_dim: embedding dimension for delay embedding + lag: lag for delay embedding + min_tsep: minimal temporal separation between two "neighbors" (default: + find a suitable value by calculating the mean period of the data) + tau: step size between data points in the time series in seconds + (normalization scaling factor for exponents) + min_neighbors: if lag=None, the search for a suitable lag will be stopped when the + number of potential neighbors for a vector drops below min_neighbors + trajectory_len: the time (in number of data points) to follow the distance + trajectories between two neighboring points + fit: the fitting method to use for the line fit, either 'poly' for normal + least squares polynomial fitting or 'RANSAC' for RANSAC-fitting which + is more robust to outliers + debug_plot: if True, a simple plot of the final line-fitting step will + be shown + debug_data: if True, debugging data will be returned alongside the result + plot_file: if debug_plot is True and plot_file is not None, the plot will be saved + under the given file name instead of directly showing it through + ``plt.show()`` + fit_offset: neglect the first fit_offset steps when fitting + random_state: Seed for random number generator used for RANSAC + + + Returns: + An estimate of the largest Lyapunov exponent (a positive exponent is + a strong indicator for chaos). If `debug_data = True`, the return + type is a tuple instead with the second element being another tuple + containing + + - the x-values of the line fit + - the y-values of the line fit + - the line coefficients (`[slope, intercept]`). 
+ """ + # convert data to float to avoid overflow errors in rowwise_euclidean + data = np.asarray(data, dtype=float_precision) + n = len(data) + max_tsep_factor = 0.25 + if lag is None or min_tsep is None: + # both the algorithm for lag and min_tsep need the fft + f = np.fft.rfft(data, n * 2 - 1) + if min_tsep is None: + # calculate min_tsep as mean period (= 1 / mean frequency) + # to get the mean frequency, we weight the frequency buckets in the + # fft result by the absolute power in that bucket and then divide + # by the total power across all buckets to get a weighted mean. + # This can be inaccurate for non-stationary inputs. A better approach would + # be to use scipy.signal.welch, but this requires making some other + # parameter choices like the size of the sliding window that require some + # knowledge about the input data, which we don't have at this point. + freqs = np.fft.rfftfreq(n * 2 - 1) + psd = np.abs(f) ** 2 + mf = np.sum(freqs[1:] * psd[1:]) / np.sum(psd[1:]) + min_tsep = int(np.ceil(1.0 / mf)) + if min_tsep > max_tsep_factor * n: + min_tsep = int(max_tsep_factor * n) + msg = "signal has very low mean frequency, setting min_tsep = {:d}" + warnings.warn(msg.format(min_tsep), RuntimeWarning, stacklevel=2) + if lag is None: + # calculate the lag as point where the autocorrelation drops to (1 - 1/e) + # times its maximum value + # note: the Wiener–Khinchin theorem states that the spectral + # decomposition of the autocorrelation function of a process is the power + # spectrum of that process + # => we can use fft to calculate the autocorrelation + acorr = np.fft.irfft(f * np.conj(f)) + acorr = np.roll(acorr, n - 1) + eps = acorr[n - 1] * (1 - 1.0 / np.e) + lag = 1 + + def nb_neighbors(lag_value: int) -> int: + """Returns resulting number of vectors for a given lag value.""" + min_len = lyap_r_len( + emb_dim=emb_dim, + lag=lag_value, + trajectory_len=trajectory_len, + min_tsep=min_tsep, + ) + return max(0, n - min_len) + + # find lag + for i in 
range(1, n): + lag = i + if acorr[n - 1 + i] < eps or acorr[n - 1 - i] < eps: + break + if nb_neighbors(i) < min_neighbors: + msg = "autocorrelation declined too slowly to find suitable lag, setting lag to {}" + warnings.warn(msg.format(lag), RuntimeWarning, stacklevel=2) + break + min_len = lyap_r_len( + emb_dim=emb_dim, + lag=lag, + trajectory_len=trajectory_len, + min_tsep=min_tsep, ) - # delay embedding - orbit = delay_embedding(data, emb_dim, lag) - m = len(orbit) - # construct matrix with pairwise distances between vectors in orbit - dists = np.array([rowwise_euclidean(orbit, orbit[i]) for i in range(m)]) - # we do not want to consider vectors as neighbor that are less than min_tsep - # time steps together => mask the distances min_tsep to the right and left of - # each index by setting them to infinity (will never be considered as nearest - # neighbors) - for i in range(m): - dists[i, max(0, i - min_tsep):i + min_tsep + 1] = float("inf") - # check that we have enough data points to continue - ntraj = m - trajectory_len + 1 - min_traj = min_tsep * 2 + 2 # in each row min_tsep + 1 disances are inf - if ntraj <= 0: - msg = "Not enough data points. Need {} additional data points to follow " \ - + "a complete trajectory." - raise ValueError(msg.format(-ntraj+1)) - if ntraj < min_traj: - # not enough data points => there are rows where all values are inf - assert np.any(np.all(np.isinf(dists[:ntraj, :ntraj]), axis=1)) - msg = "Not enough data points. At least {} trajectories are required " \ - + "to find a valid neighbor for each orbit vector with min_tsep={} " \ - + "but only {} could be created." 
- raise ValueError(msg.format(min_traj, min_tsep, ntraj)) - assert np.all(np.any(np.isfinite(dists[:ntraj, :ntraj]), axis=1)) - # find nearest neighbors (exclude last columns, because these vectors cannot - # be followed in time for trajectory_len steps) - nb_idx = np.argmin(dists[:ntraj, :ntraj], axis=1) - - # build divergence trajectory by averaging distances along the trajectory - # over all neighbor pairs - div_traj = np.zeros(trajectory_len, dtype=float) - for k in range(trajectory_len): - # calculate mean trajectory distance at step k - indices = (np.arange(ntraj) + k, nb_idx + k) - div_traj_k = dists[indices] - # filter entries where distance is zero (would lead to -inf after log) - nonzero = np.where(div_traj_k != 0) - if len(nonzero[0]) == 0: - # if all entries where zero, we have to use -inf - div_traj[k] = -np.inf + if len(data) < min_len: + msg = ( + "for emb_dim = {}, lag = {}, min_tsep = {} and trajectory_len = {}" + " you need at least {} datapoints in your time series" + ) + warnings.warn( + msg.format(emb_dim, lag, min_tsep, trajectory_len, min_len), + RuntimeWarning, + stacklevel=2, + ) + # delay embedding + orbit = delay_embedding(data, emb_dim, lag) + m = len(orbit) + # construct matrix with pairwise distances between vectors in orbit + dists = np.array([rowwise_euclidean(orbit, orbit[i]) for i in range(m)], dtype=float_precision) + # we do not want to consider vectors as neighbor that are less than min_tsep + # time steps together => mask the distances min_tsep to the right and left of + # each index by setting them to infinity (will never be considered as nearest + # neighbors) + for i in range(m): + dists[i, max(0, i - min_tsep) : i + min_tsep + 1] = float("inf") + # check that we have enough data points to continue + ntraj = m - trajectory_len + 1 + min_traj = min_tsep * 2 + 2 # in each row min_tsep + 1 distances are inf + if ntraj <= 0: + msg = ( + "Not enough data points. Need {} additional data points to follow " + "a complete trajectory."
+ ) + raise ValueError(msg.format(-ntraj + 1)) + if ntraj < min_traj: + # not enough data points => there are rows where all values are inf + assert np.any(np.all(np.isinf(dists[:ntraj, :ntraj]), axis=1)), "no inf rows found" + msg = ( + "Not enough data points. At least {} trajectories are required " + "to find a valid neighbor for each orbit vector with min_tsep={} " + "but only {} could be created." + ) + raise ValueError(msg.format(min_traj, min_tsep, ntraj)) + assert np.all(np.any(np.isfinite(dists[:ntraj, :ntraj]), axis=1)), ( + "some distances are not finite" + ) + # find nearest neighbors (exclude last columns, because these vectors cannot + # be followed in time for trajectory_len steps) + nb_idx = np.argmin(dists[:ntraj, :ntraj], axis=1) + + # build divergence trajectory by averaging distances along the trajectory + # over all neighbor pairs + div_traj = np.zeros(trajectory_len, dtype=float_precision) + for k in range(trajectory_len): + # calculate mean trajectory distance at step k + indices = (np.arange(ntraj) + k, nb_idx + k) + div_traj_k = dists[indices] + # filter entries where distance is zero (would lead to -inf after log) + nonzero = np.where(div_traj_k != 0) + if len(nonzero[0]) == 0: + # if all entries were zero, we have to use -inf + div_traj[k] = -np.inf + else: + div_traj[k] = np.mean(np.log(div_traj_k[nonzero])) + # filter -inf entries from mean trajectory + ks = np.arange(trajectory_len) + finite = np.where(np.isfinite(div_traj)) + ks = ks[finite] + div_traj = div_traj[finite] + if len(ks) < 1: + # if all points or all but one point in the trajectory is -inf, we cannot + # fit a line through the remaining points => return -inf as exponent + poly = np.array([-np.inf, 0], dtype=float_precision) else: - div_traj[k] = np.mean(np.log(div_traj_k[nonzero])) - # filter -inf entries from mean trajectory - ks = np.arange(trajectory_len) - finite = np.where(np.isfinite(div_traj)) - ks = ks[finite] - div_traj = div_traj[finite] - if len(ks) < 1: - #
if all points or all but one point in the trajectory is -inf, we cannot - # fit a line through the remaining points => return -inf as exponent - poly = [-np.inf, 0] - else: - # normal line fitting - poly = poly_fit(ks[fit_offset:], div_traj[fit_offset:], 1, fit=fit) - if debug_plot: - plot_reg( - ks[fit_offset:], div_traj[fit_offset:], - poly, "k", "log(d(k))", fname=plot_file) - le = poly[0] / tau - if debug_data: - return (le, (ks, div_traj, poly)) - else: + # normal line fitting + poly = poly_fit( + ks[fit_offset:], + div_traj[fit_offset:], + 1, + fit=fit, + random_state=random_state, + ) + if debug_plot: + plot_reg( + ks[fit_offset:].astype(float_precision), + div_traj[fit_offset:], + poly, + "k", + "log(d(k))", + fname=plot_file, + ) + le = poly[0] / tau + if debug_data: + return (le, (ks, div_traj, poly)) return le -def lyap_e_len(**kwargs): - """ - Helper function that calculates the minimum number of data points required - to use lyap_e. - - Note that none of the required parameters may be set to None. - - Kwargs: - kwargs(dict): - arguments used for lyap_e (required: emb_dim, matrix_dim, min_nb - and min_tsep) - - Returns: - minimum number of data points required to call lyap_e with the given - parameters - """ - m = (kwargs['emb_dim'] - 1) // (kwargs['matrix_dim'] - 1) - # minimum length required to find single orbit vector - min_len = kwargs['emb_dim'] - # we need to follow each starting point of an orbit vector for m more steps - min_len += m - # we need min_tsep * 2 + 1 orbit vectors to find neighbors for each - min_len += kwargs['min_tsep'] * 2 - # we need at least min_nb neighbors for each orbit vector - min_len += kwargs['min_nb'] - return min_len - - -def lyap_e(data, emb_dim=10, matrix_dim=4, min_nb=None, min_tsep=0, tau=1, - debug_plot=False, debug_data=False, plot_file=None): - """ - Estimates the Lyapunov exponents for the given data using the algorithm of - Eckmann et al. [le_1]_. 
- - Recommendations for parameter settings by Eckmann et al.: - * long recording time improves accuracy, small tau does not - * use large values for emb_dim - * matrix_dim should be 'somewhat larger than the expected number of - positive Lyapunov exponents' - * min_nb = min(2 * matrix_dim, matrix_dim + 4) - - Explanation of Lyapunov exponents: - The Lyapunov exponent describes the rate of separation of two - infinitesimally close trajectories of a dynamical system in phase space. - In a chaotic system, these trajectories diverge exponentially following - the equation: - - \|X(t, X_0) - X(t, X_0 + eps)| = e^(lambda * t) * \|eps| - - In this equation X(t, X_0) is the trajectory of the system X starting at - the point X_0 in phase space at time t. eps is the (infinitesimal) - difference vector and lambda is called the Lyapunov exponent. If the - system has more than one free variable, the phase space is - multidimensional and each dimension has its own Lyapunov exponent. The - existence of at least one positive Lyapunov exponent is generally seen as - a strong indicator for chaos. - - Explanation of the Algorithm: - To calculate the Lyapunov exponents analytically, the Jacobian of the - system is required. The algorithm of Eckmann et al. therefore tries to - estimate this Jacobian by reconstructing the dynamics of the system from - which the time series was obtained. For this, several steps are required: - - * Embed the time series [x_1, x_2, ..., x_(N-1)] in an orbit of emb_dim - dimensions (map each point x_i of the time series to a vector - [x_i, x_(i+1), x_(i+2), ... x_(i+emb_dim-1)]). - * For each vector X_i in this orbit find a radius r_i so that at least - min_nb other vectors lie within (chebyshev-)distance r_i around X_i. - These vectors will be called "neighbors" of X_i. - * Find the Matrix T_i that sends points from the neighborhood of X_i to - the neighborhood of X_(i+1). 
To avoid undetermined values in T_i, we - construct T_i not with size (emb_dim x emb_dim) but with size - (matrix_dim x matrix_dim), so that we have a larger "step size" m in the - X_i, which are now defined as X'_i = [x_i, x_(i+m), x_(i+2m), - ... x_(i+(matrix_dim-1)*m)]. This means that emb_dim-1 must be divisible - by matrix_dim-1. The T_i are then found by a linear least squares fit, - assuring that T_i (X_j - X_i) ~= X_(j+m) - X_(i+m) for any X_j in the - neighborhood of X_i. - * Starting with i = 1 and Q_0 = identity successively decompose the matrix - T_i * Q_(i-1) into the matrices Q_i and R_i by a QR-decomposition. - * Calculate the Lyapunov exponents from the mean of the logarithm of the - diagonal elements of the matrices R_i. To normalize the Lyapunov - exponents, they have to be divided by m and by the step size tau of the - original time series. - - References: - .. [le_1] J. P. Eckmann, S. O. Kamphorst, D. Ruelle, and S. Ciliberto, - “Liapunov exponents from time series,” Physical Review A, - vol. 34, no. 6, pp. 4971–4979, 1986. - - Reference code: - .. [le_a] Manfred Füllsack, "Lyapunov exponent", - url: http://systems-sciences.uni-graz.at/etextbook/sw2/lyapunov.html - .. [le_b] Steve SIU, Lyapunov Exponents Toolbox (LET), - url: http://www.mathworks.com/matlabcentral/fileexchange/233-let/content/LET/findlyap.m - .. 
[le_c] Rainer Hegger, Holger Kantz, and Thomas Schreiber, TISEAN, - url: http://www.mpipks-dresden.mpg.de/~tisean/Tisean_3.0.1/index.html - - Args: - data (array-like of float): - (scalar) data points - - Kwargs: - emb_dim (int): - embedding dimension - matrix_dim (int): - matrix dimension (emb_dim - 1 must be divisible by matrix_dim - 1) - min_nb (int): - minimal number of neighbors - (default: min(2 * matrix_dim, matrix_dim + 4)) - min_tsep (int): - minimal temporal separation between two "neighbors" - tau (float): - step size of the data in seconds - (normalization scaling factor for exponents) - debug_plot (boolean): - if True, a histogram matrix of the individual estimates will be shown - debug_data (boolean): - if True, debugging data will be returned alongside the result - plot_file (str): - if debug_plot is True and plot_file is not None, the plot will be saved - under the given file name instead of directly showing it through - ``plt.show()`` - - Returns: - float array: - array of matrix_dim Lyapunov exponents (positive exponents are indicators - for chaos) - 2d-array of floats: - only present if debug_data is True: all estimates for the matrix_dim - Lyapunov exponents from the x iterations of R_i. The shape of this debug - data is (x, matrix_dim). - """ - # convert to float to avoid errors when using 'inf' as distance - data = np.asarray(data, dtype=np.float64) - n = len(data) - if (emb_dim - 1) % (matrix_dim - 1) != 0: - raise ValueError("emb_dim - 1 must be divisible by matrix_dim - 1!") - m = (emb_dim - 1) // (matrix_dim - 1) - if min_nb is None: - # minimal number of neighbors as suggested by Eckmann et al. - min_nb = min(2 * matrix_dim, matrix_dim + 4) - - min_len = lyap_e_len( - emb_dim=emb_dim, matrix_dim=matrix_dim, min_nb=min_nb, min_tsep=min_tsep - ) - if n < min_len: - msg = "{} data points are not enough! 
For emb_dim = {}, matrix_dim = {}" \ - + ", min_tsep = {} and min_nb = {} you need at least {} data points " \ - + "in your time series" - warnings.warn( - msg.format(n, emb_dim, matrix_dim, min_tsep, min_nb, min_len), - RuntimeWarning - ) +def lyap_e_len(emb_dim: int, matrix_dim: int, min_tsep: int, min_nb: int) -> int: + """Returns the minimum number of data points required to use lyap_e. - # construct orbit as matrix (e = emb_dim) - # x0 x1 x2 ... xe-1 - # x1 x2 x3 ... xe - # x2 x3 x4 ... xe+1 - # ... - - # note: we need to be able to step m points further for the beta vector - # => maximum start index is n - emb_dim - m - orbit = delay_embedding(data[:-m], emb_dim, lag=1) - if len(orbit) < min_nb: - assert len(data) < min_len - msg = "Not enough data points. Need at least {} additional data points " \ - + "to have min_nb = {} neighbor candidates" - raise ValueError(msg.format(min_nb-len(orbit), min_nb)) - old_Q = np.identity(matrix_dim) - lexp = np.zeros(matrix_dim, dtype=np.float64) - lexp_counts = np.zeros(lexp.shape) - debug_values = [] - # TODO reduce number of points to visit? - # TODO performance test! - for i in range(len(orbit)): - # find neighbors for each vector in the orbit using the chebyshev distance - diffs = rowwise_chebyshev(orbit, orbit[i]) - # ensure that we do not count the difference of the vector to itself - diffs[i] = float('inf') - # mask all neighbors that are too close in time to the vector itself - mask_from = max(0, i - min_tsep) - mask_to = min(len(diffs), i + min_tsep + 1) - diffs[mask_from:mask_to] = np.inf - indices = np.argsort(diffs) - idx = indices[min_nb - 1] # index of the min_nb-nearest neighbor - r = diffs[idx] # corresponding distance - if np.isinf(r): - assert len(data) < min_len - msg = "Not enough data points. Orbit vector {} has less than min_nb = " \ - + "{} valid neighbors that are at least min_tsep = {} time steps " \ - + "away. Input must have at least length {}." 
- raise ValueError(msg.format(i, min_nb, min_tsep, min_len)) - # there may be more than min_nb vectors at distance r (if multiple vectors - # have a distance of exactly r) - # => update index accordingly - indices = np.where(diffs <= r)[0] - - # find the matrix T_i that satisifies - # T_i (orbit'[j] - orbit'[i]) = (orbit'[j+m] - orbit'[i+m]) - # for all neighbors j where orbit'[i] = [x[i], x[i+m], - # ... x[i + (matrix_dim-1)*m]] - - # note that T_i has the following form: - # 0 1 0 ... 0 - # 0 0 1 ... 0 - # ... - # a0 a1 a2 ... a(matrix_dim-1) - - # This is because for all rows except the last one the aforementioned - # equation has a clear solution since orbit'[j+m] - orbit'[i+m] = - # [x[j+m]-x[i+m], x[j+2*m]-x[i+2*m], ... x[j+d_M*m]-x[i+d_M*m]] - # and - # orbit'[j] - orbit'[i] = - # [x[j]-x[i], x[j+m]-x[i+m], ... x[j+(d_M-1)*m]-x[i+(d_M-1)*m]] - # therefore x[j+k*m] - x[i+k*m] is already contained in - # orbit'[j] - orbit'[x] for all k from 1 to matrix_dim-1. Only for - # k = matrix_dim there is an actual problem to solve. - - # We can therefore find a = [a0, a1, a2, ... a(matrix_dim-1)] by - # formulating a linear least squares problem (mat_X * a = vec_beta) - # as follows. - - # build matrix X for linear least squares (d_M = matrix_dim) - # x_j1 - x_i x_j1+m - x_i+m ... x_j1+(d_M-1)m - x_i+(d_M-1)m - # x_j2 - x_i x_j2+m - x_i+m ... x_j2+(d_M-1)m - x_i+(d_M-1)m - # ... + Note that none of the required parameters may be set to None. 
- # note: emb_dim = (d_M - 1) * m + 1 - mat_X = np.array([data[j:j + emb_dim:m] for j in indices]) - mat_X -= data[i:i + emb_dim:m] + Args: + matrix_dim: matrix dimension (emb_dim - 1 must be divisible by matrix_dim - 1) + min_nb: minimal number of neighbors + (default: min(2 * matrix_dim, matrix_dim + 4)) + min_tsep: minimal temporal separation between two "neighbors" + emb_dim: embedding dimension + """ + m = (emb_dim - 1) // (matrix_dim - 1) + # minimum length required to find single orbit vector + min_len = emb_dim + # we need to follow each starting point of an orbit vector for m more steps + min_len += m + # we need min_tsep * 2 + 1 orbit vectors to find neighbors for each + min_len += min_tsep * 2 + # we need at least min_nb neighbors for each orbit vector + min_len += min_nb + return min_len + + +@overload +def lyap_e( + data: NumberArrayLike1D, + *, + emb_dim: int = 10, + matrix_dim: int = 4, + min_nb: int | None = None, + min_tsep: int = 0, + tau: float = 1, + debug_plot: bool = False, + debug_data: Literal[False] = False, + plot_file: str | Path | None = None, +) -> FloatArray1D: ... + + +@overload +def lyap_e( + data: NumberArrayLike1D, + *, + emb_dim: int = 10, + matrix_dim: int = 4, + min_nb: int | None = None, + min_tsep: int = 0, + tau: float = 1, + debug_plot: bool = False, + debug_data: Literal[True] = True, + plot_file: str | Path | None = None, +) -> tuple[ + FloatArray1D, + FloatArray2D, +]: ... + + +def lyap_e( # noqa: C901, PLR0915 + data: NumberArrayLike1D, + *, + emb_dim: int = 10, + matrix_dim: int = 4, + min_nb: int | None = None, + min_tsep: int = 0, + tau: float = 1, + debug_plot: bool = False, + debug_data: bool = False, + plot_file: str | Path | None = None, +) -> ( + FloatArray1D + | tuple[ + FloatArray1D, + FloatArray2D, + ] +): + r"""Estimates the Lyapunov exponents using the algorithm of Eckmann et al. [le_1]_. 
+ + Recommendations for parameter settings by Eckmann et al.: + * long recording time improves accuracy, small tau does not + * use large values for emb_dim + * matrix_dim should be 'somewhat larger than the expected number of + positive Lyapunov exponents' + * min_nb = min(2 * matrix_dim, matrix_dim + 4) + + Explanation of Lyapunov exponents: + The Lyapunov exponent describes the rate of separation of two + infinitesimally close trajectories of a dynamical system in phase space. + In a chaotic system, these trajectories diverge exponentially following + the equation: + + \|X(t, X_0) - X(t, X_0 + eps)| = e^(lambda * t) * \|eps| + + In this equation X(t, X_0) is the trajectory of the system X starting at + the point X_0 in phase space at time t. eps is the (infinitesimal) + difference vector and lambda is called the Lyapunov exponent. If the + system has more than one free variable, the phase space is + multidimensional and each dimension has its own Lyapunov exponent. The + existence of at least one positive Lyapunov exponent is generally seen as + a strong indicator for chaos. + + Explanation of the Algorithm: + To calculate the Lyapunov exponents analytically, the Jacobian of the + system is required. The algorithm of Eckmann et al. therefore tries to + estimate this Jacobian by reconstructing the dynamics of the system from + which the time series was obtained. For this, several steps are required: + + * Embed the time series [x_1, x_2, ..., x_(N-1)] in an orbit of emb_dim + dimensions (map each point x_i of the time series to a vector + [x_i, x_(i+1), x_(i+2), ... x_(i+emb_dim-1)]). + * For each vector X_i in this orbit find a radius r_i so that at least + min_nb other vectors lie within (chebyshev-)distance r_i around X_i. + These vectors will be called "neighbors" of X_i. + * Find the Matrix T_i that sends points from the neighborhood of X_i to + the neighborhood of X_(i+1). 
To avoid undetermined values in T_i, we + construct T_i not with size (emb_dim x emb_dim) but with size + (matrix_dim x matrix_dim), so that we have a larger "step size" m in the + X_i, which are now defined as X'_i = [x_i, x_(i+m), x_(i+2m), + ... x_(i+(matrix_dim-1)*m)]. This means that emb_dim-1 must be divisible + by matrix_dim-1. The T_i are then found by a linear least squares fit, + assuring that T_i (X_j - X_i) ~= X_(j+m) - X_(i+m) for any X_j in the + neighborhood of X_i. + * Starting with i = 1 and Q_0 = identity successively decompose the matrix + T_i * Q_(i-1) into the matrices Q_i and R_i by a QR-decomposition. + * Calculate the Lyapunov exponents from the mean of the logarithm of the + diagonal elements of the matrices R_i. To normalize the Lyapunov + exponents, they have to be divided by m and by the step size tau of the + original time series. + + References: + .. [le_1] J. P. Eckmann, S. O. Kamphorst, D. Ruelle, and S. Ciliberto, + “Liapunov exponents from time series,” Physical Review A, + vol. 34, no. 6, pp. 4971–4979, 1986. - # build vector beta for linear least squares - # x_j1+(d_M)m - x_i+(d_M)m - # x_j2+(d_M)m - x_i+(d_M)m + Reference code: + .. [le_a] Manfred Füllsack, "Lyapunov exponent", + url: http://systems-sciences.uni-graz.at/etextbook/sw2/lyapunov.html + .. [le_b] Steve SIU, Lyapunov Exponents Toolbox (LET), + url: http://www.mathworks.com/matlabcentral/fileexchange/233-let/content/LET/findlyap.m + .. 
[le_c] Rainer Hegger, Holger Kantz, and Thomas Schreiber, TISEAN, + url: http://www.mpipks-dresden.mpg.de/~tisean/Tisean_3.0.1/index.html + + Args: + data: (scalar) data points + emb_dim: embedding dimension + matrix_dim: matrix dimension (emb_dim - 1 must be divisible by matrix_dim - 1) + min_nb: minimal number of neighbors + (default: min(2 * matrix_dim, matrix_dim + 4)) + min_tsep: minimal temporal separation between two "neighbors" + tau: step size of the data in seconds + (normalization scaling factor for exponents) + debug_plot: if True, a histogram matrix of the individual estimates will be shown + debug_data: if True, debugging data will be returned alongside the result + plot_file: if debug_plot is True and plot_file is not None, the plot will be saved + under the given file name instead of directly showing it through + ``plt.show()`` + + Returns: + Array of matrix_dim Lyapunov exponents (positive exponents are indicators + for chaos). If `debug_data = True`, the return type is a tuple instead + with the first element being the Lyapunov exponents and the second element + being all estimates for the matrix_dim Lyapunov exponents from the x + iterations of R_i. The shape of this debug data is (x, matrix_dim). + """ + # convert to float to avoid errors when using 'inf' as distance + data = np.asarray(data, dtype=float_precision) + n = len(data) + if (emb_dim - 1) % (matrix_dim - 1) != 0: + msg = "emb_dim - 1 must be divisible by matrix_dim - 1!" + raise ValueError(msg) + m = (emb_dim - 1) // (matrix_dim - 1) + if min_nb is None: + # minimal number of neighbors as suggested by Eckmann et al. + min_nb = min(2 * matrix_dim, matrix_dim + 4) + + min_len = lyap_e_len( + emb_dim=emb_dim, + matrix_dim=matrix_dim, + min_nb=min_nb, + min_tsep=min_tsep, + ) + if n < min_len: + msg = ( + "{} data points are not enough! 
For emb_dim = {}, matrix_dim = {}" + ", min_tsep = {} and min_nb = {} you need at least {} data points " + "in your time series" + ) + warnings.warn( + msg.format(n, emb_dim, matrix_dim, min_tsep, min_nb, min_len), + RuntimeWarning, + stacklevel=2, + ) + + # construct orbit as matrix (e = emb_dim) + # x0 x1 x2 ... xe-1 + # x1 x2 x3 ... xe + # x2 x3 x4 ... xe+1 # ... - if max(np.max(indices), i) + matrix_dim * m >= len(data): - assert len(data) < min_len - msg = "Not enough data points. Cannot follow orbit vector {} for " \ - + "{} (matrix_dim * m) time steps. Input must have at least " \ - + "length {}." - raise ValueError(msg.format(i, matrix_dim * m, min_len)) - vec_beta = data[indices + matrix_dim * m] - data[i + matrix_dim * m] - - # perform linear least squares - a, _, _, _ = np.linalg.lstsq(mat_X, vec_beta, rcond=-1) - # build matrix T - # 0 1 0 ... 0 - # 0 0 1 ... 0 + + # note: we need to be able to step m points further for the beta vector + # => maximum start index is n - emb_dim - m + orbit = delay_embedding(data[:-m], emb_dim, lag=1) + if len(orbit) < min_nb: + assert len(data) < min_len + msg = ( + "Not enough data points. 
Need at least {} additional data points " + "to have min_nb = {} neighbor candidates" + ) + raise ValueError(msg.format(min_nb - len(orbit), min_nb)) + old_Q = np.identity(matrix_dim) + lexp = np.zeros(matrix_dim, dtype=float_precision) + lexp_counts = np.zeros(lexp.shape) + debug_values = [] + for i in range(len(orbit)): + # find neighbors for each vector in the orbit using the chebyshev distance + diffs = rowwise_chebyshev(orbit, orbit[i]) + # ensure that we do not count the difference of the vector to itself + diffs[i] = float("inf") + # mask all neighbors that are too close in time to the vector itself + mask_from = max(0, i - min_tsep) + mask_to = min(len(diffs), i + min_tsep + 1) + diffs[mask_from:mask_to] = np.inf + indices = np.argsort(diffs) + idx = indices[min_nb - 1] # index of the min_nb-nearest neighbor + r = diffs[idx] # corresponding distance + if np.isinf(r): + assert len(data) < min_len + msg = ( + "Not enough data points. Orbit vector {} has less than min_nb = " + "{} valid neighbors that are at least min_tsep = {} time steps " + "away. Input must have at least length {}." + ) + raise ValueError(msg.format(i, min_nb, min_tsep, min_len)) + # there may be more than min_nb vectors at distance r (if multiple vectors + # have a distance of exactly r) + # => update index accordingly + indices = (diffs <= r).nonzero()[0] + + # find the matrix T_i that satisifies + # T_i (orbit'[j] - orbit'[i]) = (orbit'[j+m] - orbit'[i+m]) + # for all neighbors j where orbit'[i] = [x[i], x[i+m], + # ... x[i + (matrix_dim-1)*m]] + + # note that T_i has the following form: + # 0 1 0 ... 0 + # 0 0 1 ... 0 + # ... + # a0 a1 a2 ... a(matrix_dim-1) + + # This is because for all rows except the last one the aforementioned + # equation has a clear solution since orbit'[j+m] - orbit'[i+m] = + # [x[j+m]-x[i+m], x[j+2*m]-x[i+2*m], ... x[j+d_M*m]-x[i+d_M*m]] + # and + # orbit'[j] - orbit'[i] = + # [x[j]-x[i], x[j+m]-x[i+m], ... 
x[j+(d_M-1)*m]-x[i+(d_M-1)*m]] + # therefore x[j+k*m] - x[i+k*m] is already contained in + # orbit'[j] - orbit'[x] for all k from 1 to matrix_dim-1. Only for + # k = matrix_dim there is an actual problem to solve. + + # We can therefore find a = [a0, a1, a2, ... a(matrix_dim-1)] by + # formulating a linear least squares problem (mat_X * a = vec_beta) + # as follows. + + # build matrix X for linear least squares (d_M = matrix_dim) + # x_j1 - x_i x_j1+m - x_i+m ... x_j1+(d_M-1)m - x_i+(d_M-1)m + # x_j2 - x_i x_j2+m - x_i+m ... x_j2+(d_M-1)m - x_i+(d_M-1)m + # ... + + # note: emb_dim = (d_M - 1) * m + 1 # noqa: ERA001 + mat_X = np.array([data[j : j + emb_dim : m] for j in indices], dtype=float_precision) + mat_X -= data[i : i + emb_dim : m] + + # build vector beta for linear least squares + # x_j1+(d_M)m - x_i+(d_M)m + # x_j2+(d_M)m - x_i+(d_M)m + # ... + if max(int(np.max(indices)), i) + matrix_dim * m >= len(data): + assert len(data) < min_len + msg = ( + "Not enough data points. Cannot follow orbit vector {} for " + "{} (matrix_dim * m) time steps. Input must have at least " + "length {}." + ) + raise ValueError(msg.format(i, matrix_dim * m, min_len)) + vec_beta = data[indices + matrix_dim * m] - data[i + matrix_dim * m] + + # perform linear least squares + a, _, _, _ = np.linalg.lstsq(mat_X, vec_beta, rcond=-1) + # build matrix T + # 0 1 0 ... 0 + # 0 0 1 ... 0 + # ... + # 0 0 0 ... 1 + # a1 a2 a3 ... 
a_(d_M) + mat_T = np.zeros((matrix_dim, matrix_dim)) + mat_T[:-1, 1:] = np.identity(matrix_dim - 1) + mat_T[-1] = a + + # QR-decomposition of T * old_Q + mat_Q, mat_R = np.linalg.qr(np.dot(mat_T, old_Q)) + # force diagonal of R to be positive + # (if QR = A then also QLL'R = A with L' = L^-1) + sign_diag = np.sign(np.diag(mat_R)) + sign_diag[np.where(sign_diag == 0)] = 1 + sign_diag = np.diag(sign_diag) + mat_Q = np.dot(mat_Q, sign_diag) + mat_R = np.dot(sign_diag, mat_R) + + old_Q = mat_Q + # successively build sum for Lyapunov exponents + diag_R = np.diag(mat_R) + # filter zeros in mat_R (would lead to -infs) + idx = np.where(diag_R > 0) + lexp_i = np.zeros(diag_R.shape, dtype=float_precision) + lexp_i[idx] = np.log(diag_R[idx]) + lexp_i[np.where(diag_R == 0)] = np.inf + if debug_plot or debug_data: + debug_values.append(lexp_i / tau / m) + lexp[idx] += lexp_i[idx] + lexp_counts[idx] += 1 + # end of loop over orbit vectors + # it may happen that all R-matrices contained zeros => exponent really has + # to be -inf + if debug_plot: + plot_histogram_matrix( + np.array(debug_values, dtype=float_precision), "layp_e", fname=plot_file + ) + # normalize exponents over number of individual mat_Rs + idx = np.where(lexp_counts > 0) + lexp[idx] /= lexp_counts[idx] + lexp[np.where(lexp_counts == 0)] = np.inf + # normalize with respect to tau + lexp /= tau + # take m into account + lexp /= m + if debug_data: + return (lexp, np.array(debug_values, dtype=float_precision)) + return lexp + + +def plot_dists( + dists: list[FloatArray1D], + tolerance: float, + m: int, + title: str | None = None, + fname: str | Path | None = None, +) -> None: + """Plots a histogram per distance array in dists. + + Args: + dists: Distance arrays for which to plot the histograms. + tolerance: Tolerance value for the distance (will be highlighted). + m: Embedding dimension (used for labeling the histograms). + title: Title for the plot (optional). 
+ fname: If not None, the plot will be saved under this file name instead of + showing it directly with ``plt.show()``. + """ + # local import to avoid dependency for non-debug use + import matplotlib.pyplot as plt + + nstd = 3 + nbins = 50 + dists_full = np.concatenate(dists) + ymax = len(dists_full) * 0.05 + mean = np.mean(dists_full) + std = np.std(dists_full, ddof=1) + rng = (0.0, float(mean + std * nstd)) + colors = ["green", "blue"] + for i, (h, bins) in enumerate([np.histogram(dat, bins=nbins, range=rng) for dat in dists]): + bw = bins[1] - bins[0] + plt.bar(bins[:-1], h, bw, label=f"m={m + i:d}", color=colors[i], alpha=0.5) + plt.axvline(tolerance, color="red") + plt.legend(loc="best") + plt.xlabel("distance") + plt.ylabel("count") + plt.ylim(0, ymax) + if title is not None: + plt.title(title) + if fname is None: + plt.show() + else: + plt.savefig(fname) + plt.close() + + +@overload +def sampen( + data: NumberArrayLike1D, + *, + emb_dim: int = 2, + tolerance: float | None = None, + lag: int = 1, + dist: Callable[ + [ + FloatArray2D, + FloatArray1D, + ], + FloatArray1D, + ] = rowwise_chebyshev, + closed: bool = False, + debug_plot: bool = False, + debug_data: Literal[False] = False, + plot_file: str | Path | None = None, +) -> float: ... + + +@overload +def sampen( + data: NumberArrayLike1D, + *, + emb_dim: int = 2, + tolerance: float | None = None, + lag: int = 1, + dist: Callable[ + [ + FloatArray2D, + FloatArray1D, + ], + FloatArray1D, + ] = rowwise_chebyshev, + closed: bool = False, + debug_plot: bool = False, + debug_data: Literal[True], + plot_file: str | Path | None = None, +) -> tuple[ + float, + list[float], + list[FloatArray1D], +]: ... 
+ + +def sampen( # noqa: C901, PLR0912 + data: NumberArrayLike1D, + *, + emb_dim: int = 2, + tolerance: float | None = None, + lag: int = 1, + dist: Callable[ + [ + FloatArray2D, + FloatArray1D, + ], + FloatArray1D, + ] = rowwise_chebyshev, + closed: bool = False, + debug_plot: bool = False, + debug_data: bool = False, + plot_file: str | Path | None = None, +) -> ( + float + | tuple[ + float, + list[float], + list[FloatArray1D], + ] +): + """Computes the sample entropy of the given data. + + Explanation of the sample entropy: + The sample entropy of a time series is defined as the negative natural + logarithm of the conditional probability that two sequences similar for + emb_dim points remain similar at the next point, excluding self-matches. + + A lower value for the sample entropy therefore corresponds to a higher + probability indicating more self-similarity. + + Explanation of the algorithm: + The algorithm constructs all subsequences of length emb_dim + [s_1, s_1+lag, s_1+2*lag, ...] and then counts each pair (s_i, s_j) with i != j + where dist(s_i, s_j) < tolerance. The same process is repeated for all + subsequences of length emb_dim + 1. The sum of similar sequence pairs + with length emb_dim + 1 is divided by the sum of similar sequence pairs + with length emb_dim. The result of the algorithm is the negative logarithm + of this ratio/probability. + + References: + .. [se_1] J. S. Richman and J. R. Moorman, “Physiological time-series + analysis using approximate entropy and sample entropy,” + American Journal of Physiology-Heart and Circulatory Physiology, + vol. 278, no. 6, pp. H2039–H2049, 2000. + + Reference code: + .. 
[se_a] "sample_entropy" function in R-package "pracma", + url: https://cran.r-project.org/web/packages/pracma/pracma.pdf + + Args: + data: input data + emb_dim (int): + the embedding dimension (length of vectors to compare) + tolerance (float): + distance threshold for two template vectors to be considered equal + (default: 0.2 * std(data) at emb_dim = 2, corrected for dimension effect + for other values of emb_dim) + lag (int): + delay for the delay embedding + dist (function (2d-array, 1d-array) -> 1d-array): + distance function used to calculate the distance between template + vectors. Sampen is defined using ``rowwise_chebyshev``. You should only + use something else, if you are sure that you need it. + closed (boolean): + if True, will check for vector pairs whose distance is in the closed + interval [0, r] (less or equal to r), otherwise the open interval + [0, r) (less than r) will be used + debug_plot (boolean): + if True, a histogram of the individual distances for m and m+1 + debug_data (boolean): + if True, debugging data will be returned alongside the result + plot_file (str): + if debug_plot is True and plot_file is not None, the plot will be saved + under the given file name instead of directly showing it through + ``plt.show()`` + + Returns: + The sample entropy of the data (negative logarithm of ratio between + similar template vectors of length emb_dim + 1 and emb_dim). If + `debug_data` is True, the return type is a tuple instead, containing + + - sampen: the sample entropy + - [c_m, c_m1]: list of two floats: count of similar template vectors of + length emb_dim (c_m) and of length emb_dim + 1 (c_m1) + - [dists_m, dists_m1]: the distances between template vectors for m + (dists_m) and for m + 1 (dists_m1). + """ + data = np.asarray(data, dtype=float_precision) + + if tolerance is None: + # the reasoning behind this default value is the following: + # 1. physionet uses the default values emb_dim = 2, tolerance = 0.2 + # 2. 
the chebyshev distance rises logarithmically with increasing dimension + # 3. 0.5627 * np.log(emb_dim) + 1.3334 is the logarithmic trend line for + # the chebyshev distance of vectors sampled from a univariate normal + # distribution + # 4. 0.1164 is used as a factor to ensure that tolerance == std * 0.2 for + # emb_dim == 2 # noqa: ERA001 + tolerance = np.std(data, ddof=1) * 0.1164 * (0.5627 * np.log(emb_dim) + 1.3334) + n = len(data) + + # build matrix of "template vectors" + # (all consecutive subsequences of length m) + # x0 x1 x2 x3 ... xm-1 + # x1 x2 x3 x4 ... xm + # x2 x3 x4 x5 ... xm+1 # ... - # 0 0 0 ... 1 - # a1 a2 a3 ... a_(d_M) - mat_T = np.zeros((matrix_dim, matrix_dim)) - mat_T[:-1, 1:] = np.identity(matrix_dim - 1) - mat_T[-1] = a - - # QR-decomposition of T * old_Q - mat_Q, mat_R = np.linalg.qr(np.dot(mat_T, old_Q)) - # force diagonal of R to be positive - # (if QR = A then also QLL'R = A with L' = L^-1) - sign_diag = np.sign(np.diag(mat_R)) - sign_diag[np.where(sign_diag == 0)] = 1 - sign_diag = np.diag(sign_diag) - mat_Q = np.dot(mat_Q, sign_diag) - mat_R = np.dot(sign_diag, mat_R) - - old_Q = mat_Q - # successively build sum for Lyapunov exponents - diag_R = np.diag(mat_R) - # filter zeros in mat_R (would lead to -infs) - idx = np.where(diag_R > 0) - lexp_i = np.zeros(diag_R.shape, dtype=np.float64) - lexp_i[idx] = np.log(diag_R[idx]) - lexp_i[np.where(diag_R == 0)] = np.inf - if debug_plot or debug_data: - debug_values.append(lexp_i / tau / m) - lexp[idx] += lexp_i[idx] - lexp_counts[idx] += 1 - # end of loop over orbit vectors - # it may happen that all R-matrices contained zeros => exponent really has - # to be -inf - if debug_plot: - plot_histogram_matrix(np.array(debug_values), "layp_e", fname=plot_file) - # normalize exponents over number of individual mat_Rs - idx = np.where(lexp_counts > 0) - lexp[idx] /= lexp_counts[idx] - lexp[np.where(lexp_counts == 0)] = np.inf - # normalize with respect to tau - lexp /= tau - # take m into account - 
lexp /= m - if debug_data: - return (lexp, np.array(debug_values)) - return lexp - - -def plot_dists(dists, tolerance, m, title=None, fname=None): - # local import to avoid dependency for non-debug use - import matplotlib.pyplot as plt - nstd = 3 - nbins = 50 - dists_full = np.concatenate(dists) - ymax = len(dists_full) * 0.05 - mean = np.mean(dists_full) - std = np.std(dists_full, ddof=1) - rng = (0, mean + std * nstd) - i = 0 - colors = ["green", "blue"] - for h, bins in [np.histogram(dat, nbins, rng) for dat in dists]: - bw = bins[1] - bins[0] - plt.bar(bins[:-1], h, bw, label="m={:d}".format(m + i), - color=colors[i], alpha=0.5) - i += 1 - plt.axvline(tolerance, color="red") - plt.legend(loc="best") - plt.xlabel("distance") - plt.ylabel("count") - plt.ylim(0, ymax) - if title is not None: - plt.title(title) - if fname is None: - plt.show() - else: - plt.savefig(fname) - plt.close() - - -def sampen(data, emb_dim=2, tolerance=None, lag=1, dist=rowwise_chebyshev, - closed=False, debug_plot=False, debug_data=False, plot_file=None): - """ - Computes the sample entropy of the given data. - - Explanation of the sample entropy: - The sample entropy of a time series is defined as the negative natural - logarithm of the conditional probability that two sequences similar for - emb_dim points remain similar at the next point, excluding self-matches. - - A lower value for the sample entropy therefore corresponds to a higher - probability indicating more self-similarity. - - Explanation of the algorithm: - The algorithm constructs all subsequences of length emb_dim - [s_1, s_1+lag, s_1+2*lag, ...] and then counts each pair (s_i, s_j) with i != j - where dist(s_i, s_j) < tolerance. The same process is repeated for all - subsequences of length emb_dim + 1. The sum of similar sequence pairs - with length emb_dim + 1 is divided by the sum of similar sequence pairs - with length emb_dim. The result of the algorithm is the negative logarithm - of this ratio/probability. 
- - References: - .. [se_1] J. S. Richman and J. R. Moorman, “Physiological time-series - analysis using approximate entropy and sample entropy,” - American Journal of Physiology-Heart and Circulatory Physiology, - vol. 278, no. 6, pp. H2039–H2049, 2000. - - Reference code: - .. [se_a] "sample_entropy" function in R-package "pracma", - url: https://cran.r-project.org/web/packages/pracma/pracma.pdf - - Args: - data (array-like of float): - input data - - Kwargs: - emb_dim (int): - the embedding dimension (length of vectors to compare) - tolerance (float): - distance threshold for two template vectors to be considered equal - (default: 0.2 * std(data) at emb_dim = 2, corrected for dimension effect - for other values of emb_dim) - lag (int): - delay for the delay embedding - dist (function (2d-array, 1d-array) -> 1d-array): - distance function used to calculate the distance between template - vectors. Sampen is defined using ``rowwise_chebyshev``. You should only - use something else, if you are sure that you need it. 
- closed (boolean): - if True, will check for vector pairs whose distance is in the closed - interval [0, r] (less or equal to r), otherwise the open interval - [0, r) (less than r) will be used - debug_plot (boolean): - if True, a histogram of the individual distances for m and m+1 - debug_data (boolean): - if True, debugging data will be returned alongside the result - plot_file (str): - if debug_plot is True and plot_file is not None, the plot will be saved - under the given file name instead of directly showing it through - ``plt.show()`` - - Returns: - float: - the sample entropy of the data (negative logarithm of ratio between - similar template vectors of length emb_dim + 1 and emb_dim) - [c_m, c_m1]: - list of two floats: count of similar template vectors of length emb_dim - (c_m) and of length emb_dim + 1 (c_m1) - [float list, float list]: - Lists of lists of the form ``[dists_m, dists_m1]`` containing the - distances between template vectors for m (dists_m) - and for m + 1 (dists_m1). - """ - data = np.asarray(data) - - if tolerance is None: - # the reasoning behind this default value is the following: - # 1. physionet uses the default values emb_dim = 2, tolerance = 0.2 - # 2. the chebyshev distance rises logarithmically with increasing dimension - # 3. 0.5627 * np.log(emb_dim) + 1.3334 is the logarithmic trend line for - # the chebyshev distance of vectors sampled from a univariate normal - # distribution - # 4. 0.1164 is used as a factor to ensure that tolerance == std * 0.2 for - # emb_dim == 2 - tolerance = np.std(data, ddof=1) * 0.1164 * (0.5627 * np.log(emb_dim) + 1.3334) - n = len(data) - - # build matrix of "template vectors" - # (all consecutive subsequences of length m) - # x0 x1 x2 x3 ... xm-1 - # x1 x2 x3 x4 ... xm - # x2 x3 x4 x5 ... xm+1 - # ... - # x_n-m-1 ... 
xn-1 - - # since we need two of these matrices for m = emb_dim and m = emb_dim +1, - # we build one that is large enough => shape (emb_dim+1, n-emb_dim) - - # note that we ignore the last possible template vector with length emb_dim, - # because this vector has no corresponding vector of length m+1 and thus does - # not count towards the conditional probability - # (otherwise first dimension would be n-emb_dim+1 and not n-emb_dim) - tVecs = delay_embedding(np.asarray(data), emb_dim+1, lag=lag) - plot_data = [] - counts = [] - for m in [emb_dim, emb_dim + 1]: - counts.append(0) - plot_data.append([]) - # get the matrix that we need for the current m - tVecsM = tVecs[:n - m + 1, :m] - # successively calculate distances between each pair of template vectors - for i in range(len(tVecsM) - 1): - dsts = dist(tVecsM[i + 1:], tVecsM[i]) - if debug_plot or debug_data: - plot_data[-1].extend(dsts) - # count how many distances are smaller than the tolerance - if closed: - counts[-1] += np.sum(dsts <= tolerance) - else: - counts[-1] += np.sum(dsts < tolerance) - if counts[0] > 0 and counts[1] > 0: - saen = -np.log(1.0 * counts[1] / counts[0]) - else: - # log would be infinite or undefined => cannot determine saen - zcounts = [] - if counts[0] == 0: - zcounts.append("emb_dim") - if counts[1] == 0: - zcounts.append("emb_dim + 1") - warnings.warn( - ( - "Zero vectors are within tolerance for %s. " \ - + "Consider raising the tolerance parameter to avoid %s result." - ) % (" and ".join(zcounts), "NaN" if len(zcounts) == 2 else "inf"), - RuntimeWarning - ) - if counts[0] == 0 and counts[1] == 0: - saen = np.nan - elif counts[0] == 0: - saen = -np.inf + # x_n-m-1 ... 
xn-1 + + # since we need two of these matrices for m = emb_dim and m = emb_dim +1, + # we build one that is large enough => shape (emb_dim+1, n-emb_dim) + + # note that we ignore the last possible template vector with length emb_dim, + # because this vector has no corresponding vector of length m+1 and thus does + # not count towards the conditional probability + # (otherwise first dimension would be n-emb_dim+1 and not n-emb_dim) + tVecs = delay_embedding(np.asarray(data), emb_dim + 1, lag=lag) + plot_data = [] + counts = [] + for m in [emb_dim, emb_dim + 1]: + counts.append(0) + plot_data.append([]) + # get the matrix that we need for the current m + tVecsM = tVecs[: n - m + 1, :m] + # successively calculate distances between each pair of template vectors + for i in range(len(tVecsM) - 1): + dsts = dist(tVecsM[i + 1 :], tVecsM[i]) + if debug_plot or debug_data: + plot_data[-1].extend(dsts) + # count how many distances are smaller than the tolerance + if closed: + counts[-1] += np.sum(dsts <= cast("float", tolerance)) + else: + counts[-1] += np.sum(dsts < cast("float", tolerance)) + if counts[0] > 0 and counts[1] > 0: + saen = -np.log(1.0 * counts[1] / counts[0]) else: - saen = np.inf - if debug_plot: - plot_dists(plot_data, tolerance, m, title="sampEn = {:.3f}".format(saen), - fname=plot_file) - if debug_data: - return (saen, counts, plot_data) - else: + # log would be infinite or undefined => cannot determine saen + zcounts = [] + if counts[0] == 0: + zcounts.append("emb_dim") + if counts[1] == 0: + zcounts.append("emb_dim + 1") + warnings.warn( + ( + "Zero vectors are within tolerance for {}. " + "Consider raising the tolerance parameter to avoid {} result." 
+ ).format(" and ".join(zcounts), "NaN" if len(zcounts) == 2 else "inf"), # noqa: PLR2004 + RuntimeWarning, + stacklevel=2, + ) + if counts[0] == 0 and counts[1] == 0: + saen = np.nan + elif counts[0] == 0: + saen = -np.inf + else: + saen = np.inf + if debug_plot: + plot_dists( + plot_data, + cast("float", tolerance), + m, + title=f"sampEn = {saen:.3f}", + fname=plot_file, + ) + if debug_data: + return (saen, counts, plot_data) return saen -def binary_n(total_N, min_n=50): - """ - Creates a list of values by successively halving the total length total_N - until the resulting value is less than min_n. - - Non-integer results are rounded down. - - Args: - total_N (int): - total length - Kwargs: - min_n (int): - minimal length after division - - Returns: - list of integers: - total_N/2, total_N/4, total_N/8, ... until total_N/2^i < min_n - """ - max_exp = np.log2(1.0 * total_N / min_n) - max_exp = int(np.floor(max_exp)) - return [int(np.floor(1.0 * total_N / (2**i))) for i in range(1, max_exp + 1)] - - -def logarithmic_n(min_n, max_n, factor): - """ - Creates a list of values by successively multiplying a minimum value min_n by - a factor > 1 until a maximum value max_n is reached. - - Non-integer results are rounded down. - - Args: - min_n (float): - minimum value (must be < max_n) - max_n (float): - maximum value (must be > min_n) - factor (float): - factor used to increase min_n (must be > 1) - - Returns: - list of integers: - min_n, min_n * factor, min_n * factor^2, ... 
min_n * factor^i < max_n - without duplicates - """ - assert max_n > min_n - assert factor > 1 - # stop condition: min * f^x = max - # => f^x = max/min - # => x = log(max/min) / log(f) - max_i = int(np.floor(np.log(1.0 * max_n / min_n) / np.log(factor))) - ns = [min_n] - for i in range(max_i + 1): - n = int(np.floor(min_n * (factor ** i))) - if n > ns[-1]: - ns.append(n) - return ns - - -def logmid_n(max_n, ratio=1/4.0, nsteps=15): - """ - Creates an array of integers that lie evenly spaced in the "middle" of the - logarithmic scale from 0 to log(max_n). - - If max_n is very small and/or nsteps is very large, this may lead to - duplicate values which will be removed from the output. - - This function has benefits in hurst_rs, because it cuts away both very small - and very large n, which both can cause problems, and still produces a - logarithmically spaced sequence. - - Args: - max_n (int): - largest possible output value (should be the sequence length when used in - hurst_rs) - - Kwargs: - ratio (float): - width of the "middle" of the logarithmic interval relative to log(max_n). - For example, for ratio=1/2.0 the logarithm of the resulting values will - lie between 0.25 * log(max_n) and 0.75 * log(max_n). - nsteps (float): - (maximum) number of values to take from the specified range - - Returns: - array of int: - a logarithmically spaced sequence of at most nsteps values (may be less, - because only unique values are returned) - """ - l = np.log(max_n) - span = l * ratio - start = l * (1 - ratio) * 0.5 - midrange = start + 1.0*np.arange(nsteps)/nsteps*span - nvals = np.round(np.exp(midrange)).astype("int32") - return np.unique(nvals) - - -def logarithmic_r(min_n, max_n, factor): - """ - Creates a list of values by successively multiplying a minimum value min_n by - a factor > 1 until a maximum value max_n is reached. 
- - Args: - min_n (float): - minimum value (must be < max_n) - max_n (float): - maximum value (must be > min_n) - factor (float): - factor used to increase min_n (must be > 1) - - Returns: - list of floats: - min_n, min_n * factor, min_n * factor^2, ... min_n * factor^i < max_n - """ - assert max_n > min_n - assert factor > 1 - max_i = int(np.floor(np.log(1.0 * max_n / min_n) / np.log(factor))) - return [min_n * (factor ** i) for i in range(max_i + 1)] - - -def expected_rs(n): - """ - Calculates the expected (R/S)_n for white noise for a given n. - - This is used as a correction factor in the function hurst_rs. It uses the - formula of Anis-Lloyd-Peters (see [h_3]_). - - Args: - n (int): - the value of n for which the expected (R/S)_n should be calculated - - Returns: - float: - expected (R/S)_n for white noise - """ - front = (n - 0.5) / n - i = np.arange(1, n) - back = np.sum(np.sqrt((n - i) / i)) - if n <= 340: - middle = math.gamma((n-1) * 0.5) / math.sqrt(math.pi) / math.gamma(n * 0.5) - else: - middle = 1.0 / math.sqrt(n * math.pi * 0.5) - return front * middle * back - - -def expected_h(nvals, fit="RANSAC"): - """ - Uses expected_rs to calculate the expected value for the Hurst exponent h - based on the values of n used for the calculation. - - Args: - nvals (iterable of int): - the values of n used to calculate the individual (R/S)_n - - KWargs: - fit (str): - the fitting method to use for the line fit, either 'poly' for normal - least squares polynomial fitting or 'RANSAC' for RANSAC-fitting which - is more robust to outliers - - Returns: - float: - expected h for white noise - """ - rsvals = [expected_rs(n) for n in nvals] - poly = poly_fit(np.log(nvals), np.log(rsvals), 1, fit=fit) - return poly[0] - - -def rs(data, n, unbiased=True): - """ - Calculates an individual R/S value in the rescaled range approach for - a given n. - - Note: This is just a helper function for hurst_rs and should not be called - directly. 
- - Args: - data (array-like of float): - time series - n (float): - size of the subseries in which data should be split - - Kwargs: - unbiased (boolean): - if True, the standard deviation based on the unbiased variance - (1/(N-1) instead of 1/N) will be used. This should be the default choice, - since the true mean of the sequences is not known. This parameter should - only be changed to recreate results of other implementations. - - Returns: - float: - (R/S)_n - """ - data = np.asarray(data) - total_N = len(data) - m = total_N // n # number of sequences - # cut values at the end of data to make the array divisible by n - data = data[:total_N - (total_N % n)] - # split remaining data into subsequences of length n - seqs = np.reshape(data, (m, n)) - # calculate means of subsequences - means = np.mean(seqs, axis=1) - # normalize subsequences by substracting mean - y = seqs - means.reshape((m, 1)) - # build cumulative sum of subsequences - y = np.cumsum(y, axis=1) - # find ranges - r = np.max(y, axis=1) - np.min(y, axis=1) - # find standard deviation - # we should use the unbiased estimator, since we do not know the true mean - s = np.std(seqs, axis=1, ddof=1 if unbiased else 0) - # some ranges may be zero and have to be excluded from the analysis - idx = np.where(r != 0) - r = r[idx] - s = s[idx] - # it may happen that all ranges are zero (if all values in data are equal) - if len(r) == 0: - return np.nan - else: +def binary_n(total_N: int, min_n: int = 50) -> list[int]: + """Creates a list of values by successively halving the total length total_N. + + The iteration stops when the resulting value is less than min_n. Non-integer + results are rounded down. + + Args: + total_N: total length + min_n: minimal length after division + + Returns: + [total_N/2, total_N/4, total_N/8, ...] 
until total_N/2^i < min_n + """ + max_exp = np.log2(1.0 * total_N / min_n) + max_exp = int(np.floor(max_exp)) + return [int(np.floor(1.0 * total_N / (2**i))) for i in range(1, max_exp + 1)] + + +def logarithmic_n(min_n: int, max_n: int, factor: float) -> list[int]: + """Creates a list of window sizes that are equidistant on a logarithmic scale. + + The values are calculated by multiplying a minimum value min_n by a factor > 1 + until a maximum value max_n is reached. + + Non-integer results are rounded down. + + Args: + min_n: minimum value (must be < max_n) + max_n: maximum value (must be > min_n) + factor: factor used to increase min_n (must be > 1) + + Returns: + [min_n, min_n * factor, min_n * factor^2, ... min_n * factor^i] where + all values are < max_n. Duplicates (due to step sizes less than 1) are + discarded. + """ + if max_n <= min_n: + msg = f"max_n must be larger than min_n ({max_n} <= {min_n})." + raise ValueError(msg) + if factor <= 1: + msg = f"Factor must be larger than 1, but got {factor}." + raise ValueError(msg) + # stop condition: min * f^x = max + # => f^x = max/min + # => x = log(max/min) / log(f) + max_i = int(np.floor(np.log(1.0 * max_n / min_n) / np.log(factor))) + ns = [min_n] + for i in range(max_i + 1): + n = int(np.floor(min_n * (factor**i))) + if n > ns[-1]: + ns.append(n) + return ns + + +def logmid_n(max_n: int, ratio: float = 1 / 4.0, nsteps: int = 15) -> IntArray1D: + """Creates an array of equidistant values in the "middle" of [0, max_n] on a logarithmic scale. + + If max_n is very small and/or nsteps is very large, this may lead to + duplicate values which will be removed from the output. + + This function has benefits in hurst_rs, because it cuts away both very small + and very large n, which both can cause problems, and still produces a + sequence that is equidistant on the logarithmic scale. 
+ + Args: + max_n: largest possible output value (should be the sequence length when + used in hurst_rs) + ratio: width of the "middle" of the logarithmic interval relative to log(max_n). + For example, for ratio=1/2.0 the logarithm of the resulting values will + lie between 0.25 * log(max_n) and 0.75 * log(max_n). + nsteps: (maximum) number of values to take from the specified range + + Returns: + A logarithmically spaced sequence of at most nsteps values (may be less + because only unique values are returned). + """ + logmax = np.log(max_n) + span = logmax * ratio + start = logmax * (1 - ratio) * 0.5 + midrange = start + 1.0 * np.arange(nsteps) / nsteps * span + nvals = np.round(np.exp(midrange)).astype(np.int32) + return np.unique(nvals) + + +def logarithmic_r(min_r: float, max_r: float, factor: float) -> list[float]: + """Creates a list of real values that are equidistant on a logarithmic scale. + + The values are generated by successively multiplying a minimum value min_n by + a factor > 1 until a maximum value max_n is reached. + + Args: + min_r: minimum value (must be < max_r) + max_r: maximum value (must be > min_r) + factor: factor used to increase min_r (must be > 1) + + Returns: + [min_r, min_r * factor, min_r * factor^2, ... min_r * factor^i] where + all values are < max_r. + """ + if max_r <= min_r: + msg = f"max_r must be larger than min_r ({max_r} <= {min_r})." + raise ValueError(msg) + if factor <= 1: + msg = f"Factor must be larger than 1, but got {factor}." + raise ValueError(msg) + max_i = int(np.floor(np.log(1.0 * max_r / min_r) / np.log(factor))) + return [min_r * (factor**i) for i in range(max_i + 1)] + + +def expected_rs(n: np.integer) -> float: + """Approximates the expected (R/S)_n for white noise for a given n. + + This is used as a correction factor in the function hurst_rs. It uses the + formula of Anis-Lloyd-Peters (see [h_3]_). 
+ + Args: + n: the value of n for which the expected (R/S)_n should be calculated + + Returns: + expected (R/S)_n for white noise + """ + front = (n - 0.5) / n + i = np.arange(1, n) + back = np.sum(np.sqrt((n - i) / i)) + small = 340 # small values behave differently + if n <= small: + middle = math.gamma((n - 1) * 0.5) / math.sqrt(math.pi) / math.gamma(n * 0.5) + else: + middle = 1.0 / math.sqrt(n * math.pi * 0.5) + return front * middle * back + + +def expected_h( + nvals: IntArrayLike1D, + fit: FittingMethod = "RANSAC", + random_state: int | None = None, +) -> float: + """Uses expected_rs to calculate the expected value for the Hurst exponent h. + + Args: + nvals: The values of n used to calculate the individual (R/S)_n + fit: the fitting method to use for the line fit, either 'poly' for normal + least squares polynomial fitting or 'RANSAC' for RANSAC-fitting which + is more robust to outliers + random_state: Seed for random number generator used for RANSAC + + + Returns: + expected h for white noise + """ + nvals = np.asarray(nvals, dtype=int_precision) + rsvals = [expected_rs(n) for n in nvals] + poly = poly_fit(np.log(nvals), np.log(rsvals), 1, fit=fit, random_state=random_state) + return poly[0] + + +def rs(data: FloatArray1D, n: np.integer, *, unbiased: bool = True) -> float: + """Calculates an individual R/S value in the rescaled range approach for a given n. + + Note: This is just a helper function for hurst_rs and should not be called + directly. + + Args: + data: time series + n: size of the subseries in which data should be split + unbiased: if True, the standard deviation based on the unbiased variance + (1/(N-1) instead of 1/N) will be used. This should be the default choice, + since the true mean of the sequences is not known. This parameter should + only be changed to recreate results of other implementations. 
+ + Returns: + (R/S)_n + """ + data = np.asarray(data, dtype=float_precision) + total_N = len(data) + m = total_N // n # number of sequences + # cut values at the end of data to make the array divisible by n + data = data[: total_N - (total_N % n)] + # split remaining data into subsequences of length n + seqs = np.reshape(data, (m, n)) + # calculate means of subsequences + means = np.mean(seqs, axis=1) + # normalize subsequences by substracting mean + y = seqs - means.reshape((m, 1)) + # build cumulative sum of subsequences + y = np.cumsum(y, axis=1) + # find ranges + r = np.max(y, axis=1) - np.min(y, axis=1) + # find standard deviation + # we should use the unbiased estimator, since we do not know the true mean + s = np.std(seqs, axis=1, ddof=1 if unbiased else 0) + # some ranges may be zero and have to be excluded from the analysis + idx = np.where(r != 0) + r = r[idx] + s = s[idx] + # it may happen that all ranges are zero (if all values in data are equal) + if len(r) == 0: + return np.nan # return mean of r/s along subsequence index return np.mean(r / s) -def plot_histogram_matrix(data, name, bin_range="3sigma", fname=None): - # local import to avoid dependency for non-debug use - import matplotlib.pyplot as plt - nhists = len(data[0]) - nbins = 25 - ylim = (0, 0.5) - nrows = int(np.ceil(np.sqrt(nhists))) - plt.figure(figsize=(nrows * 4, nrows * 4)) - for i in range(nhists): - plt.subplot(nrows, nrows, i + 1) - absmax = max(abs(np.max(data[:, i])), abs(np.min(data[:, i]))) - if bin_range == "absmax": - rng = (-absmax, absmax) - elif bin_range.endswith("sigma"): - n = int(bin_range[:-len("sigma")]) - mu = np.mean(data[:,i]) - sigma = np.std(data[:, i], ddof=1) - rng = (mu - n * sigma, mu + n * sigma) - h, bins = np.histogram(data[:, i], nbins, rng) - bin_width = bins[1] - bins[0] - h = h.astype(np.float64) / np.sum(h) - plt.bar(bins[:-1], h, bin_width) - plt.axvline(np.mean(data[:, i]), color="red") - plt.ylim(ylim) - plt.title("{:s}[{:d}]".format(name, i)) - if 
fname is None: - plt.show() - else: - plt.savefig(fname) - plt.close() - - -def plot_reg(xvals, yvals, poly, x_label="x", y_label="y", data_label="data", - reg_label="regression line", fname=None): - """ - Helper function to plot trend lines for line-fitting approaches. This - function will show a plot through ``plt.show()`` and close it after the - window has been closed by the user. - - Args: - xvals (list/array of float): - list of x-values - yvals (list/array of float): - list of y-values - poly (list/array of float): - polynomial parameters as accepted by ``np.polyval`` - Kwargs: - x_label (str): - label of the x-axis - y_label (str): - label of the y-axis - data_label (str): - label of the data - reg_label(str): - label of the regression line - fname (str): - file name (if not None, the plot will be saved to disc instead of - showing it though ``plt.show()``) - """ - # local import to avoid dependency for non-debug use - import matplotlib.pyplot as plt - plt.plot(xvals, yvals, "bo", label=data_label) - if not (poly is None): - plt.plot(xvals, np.polyval(poly, xvals), "r-", label=reg_label) - plt.xlabel(x_label) - plt.ylabel(y_label) - plt.legend(loc="best") - if fname is None: - plt.show() - else: - plt.savefig(fname) - plt.close() - - -def plot_reg_tiled(xvals, yvals, polys, x_label="x", y_label="y", - data_labels=None, reg_labels=None, fname=None, - columns=None): - """ - TODO - """ - # local import to avoid dependency for non-debug use - import matplotlib.pyplot as plt - max_span = max([np.max(y) - np.min(y) for y in yvals]) - means = [np.mean(y) for y in yvals] - if columns is None: - columns = min(4, int(np.ceil(np.sqrt(len(xvals))))) - if data_labels is None: - data_labels = ["data"] * len(xvals) - if reg_labels is None: - reg_labels = ["regression line"] * len(xvals) - for i in range(len(xvals)): - plt.subplot(int(np.ceil(len(xvals) / columns)), columns, i + 1) - plt.plot(xvals[i], yvals[i], "bo", label=data_labels[i]) - if not (polys is None): - 
plt.plot(xvals[i], np.polyval(polys[i], xvals[i]), "r-", label=reg_labels[i]) +def plot_histogram_matrix( + data: FloatArray2D, + name: str, + bin_range: Literal["absmax", "1sigma", "2sigma", "3sigma", "4sigma", "5sigma"] = "3sigma", + fname: str | Path | None = None, +) -> None: + """Plot a quadratic matrix of histograms. + + Args: + data: matrix of shape (N, K) where K is the number of histograms and N is the size of + a single dimension of which to take a histogram. + name: Title of the plots. + bin_range: How to determine the range of the histogram. "absmax" uses the absolute + maximum and minimum, while Xsigma cuts off values outside the X sigma range + assuming a normal distributed dataset. + fname: File name to use to store the plot. If this is not given, the plot is displayed + with show() instead. + """ + # local import to avoid dependency for non-debug use + import matplotlib.pyplot as plt + + nhists = len(data[0]) + nbins = 25 + ylim = (0, 0.5) + nrows = int(np.ceil(np.sqrt(nhists))) + plt.figure(figsize=(nrows * 4, nrows * 4)) + for i in range(nhists): + plt.subplot(nrows, nrows, i + 1) + absmax = max(float(abs(np.max(data[:, i]))), float(abs(np.min(data[:, i])))) + if bin_range == "absmax": + rng = (-absmax, absmax) + elif bin_range.endswith("sigma"): + n = int(bin_range[: -len("sigma")]) + mu = np.mean(data[:, i]) + sigma = np.std(data[:, i], ddof=1) + rng = (float(mu - n * sigma), float(mu + n * sigma)) + h, bins = np.histogram(data[:, i], nbins, rng) + bin_width = bins[1] - bins[0] + h = h.astype(float_precision) / np.sum(h) + plt.bar(bins[:-1], h, bin_width) + plt.axvline(float(np.mean(data[:, i])), color="red") + plt.ylim(ylim) + plt.title(f"{name:s}[{i:d}]") + if fname is None: + plt.show() + else: + plt.savefig(fname) + plt.close() + + +def plot_reg( + xvals: FloatArray1D, + yvals: FloatArray1D, + poly: FloatArray1D | None = None, + x_label: str = "x", + y_label: str = "y", + data_label: str = "data", + reg_label: str = "regression line", + 
fname: str | Path | None = None, +) -> None: + """Plots trend lines for line-fitting approaches. + + This function will show a plot through ``plt.show()`` and close it after the + window has been closed by the user unless `fname` is provided, in which case + the plot will be saved to disc under the given file name instead. + + Args: + xvals: list of x-values + yvals: list of y-values + poly: polynomial parameters as accepted by ``np.polyval`` + x_label: label of the x-axis + y_label: label of the y-axis + data_label: label of the data + reg_label: label of the regression line + fname: file name (if not None, the plot will be saved to disc instead of + showing it though ``plt.show()``) + """ + # local import to avoid dependency for non-debug use + import matplotlib.pyplot as plt + + plt.plot(xvals, yvals, "bo", label=data_label) + if poly is not None: + plt.plot(xvals, np.polyval(poly, xvals), "r-", label=reg_label) plt.xlabel(x_label) plt.ylabel(y_label) - plt.ylim(means[i] - max_span / 2, means[i] + max_span / 2) plt.legend(loc="best") - if fname is None: - plt.show() - else: - plt.savefig(fname) - plt.close() - - -def plot_reg_multiple(xvals, yvals, polys, x_label="x", y_label="y", - data_labels=None, reg_labels=None, fname=None): - """ - TODO - """ - import matplotlib.pyplot as plt - if data_labels is None: - data_labels = ["data"] * len(xvals) - if reg_labels is None: - reg_labels = ["regression line"] * len(xvals) - for i in range(len(xvals)): - plt.plot(xvals[i], yvals[i], "+", label=data_labels[i]) - if not (polys is None): - plt.plot(xvals[i], np.polyval(polys[i], xvals[i]), label=reg_labels[i]) - plt.xlabel(x_label) - plt.ylabel(y_label) - plt.legend(loc="best") - if fname is None: - plt.show() - else: - plt.savefig(fname) - plt.close() - - -def hurst_rs(data, nvals=None, fit="RANSAC", debug_plot=False, - debug_data=False, plot_file=None, corrected=True, unbiased=True): - """ - Calculates the Hurst exponent by a standard rescaled range (R/S) approach. 
- - Explanation of Hurst exponent: - The Hurst exponent is a measure for the "long-term memory" of a - time series, meaning the long statistical dependencies in the data that do - not originate from cycles. - - It originates from H.E. Hursts observations of the problem of long-term - storage in water reservoirs. If x_i is the discharge of a river in year i - and we observe this discharge for N years, we can calculate the storage - capacity that would be required to keep the discharge steady at its mean - value. - - To do so, we first subtract the mean over all x_i from the individual - x_i to obtain the departures x'_i from the mean for each year i. As the - excess or deficit in discharge always carries over from year i to year i+1, - we need to examine the cumulative sum of x'_i, denoted by y_i. This - cumulative sum represents the filling of our hypothetical storage. If the - sum is above 0, we are storing excess discharge from the river, if it is - below zero we have compensated a deficit in discharge by releasing - water from the storage. The range (maximum - minimum) R of y_i therefore - represents the total capacity required for the storage. - - Hurst showed that this value follows a steady trend for varying N if it - is normalized by the standard deviation sigma over the x_i. Namely he - obtained the following formula: - - R/sigma = (N/2)^K - - In this equation, K is called the Hurst exponent. Its value is 0.5 for - white noise, but becomes greater for time series that exhibit some positive - dependency on previous values. For negative dependencies it becomes less - than 0.5. - - Explanation of the algorithm: - The rescaled range (R/S) approach is directly derived from Hurst's - definition. The time series of length N is split into non-overlapping - subseries of length n. Then, R and S (S = sigma) are calculated for each - subseries and the mean is taken over all subseries yielding (R/S)_n. This - process is repeated for several lengths n. 
Finally, the exponent K is - obtained by fitting a straight line to the plot of log((R/S)_n) vs log(n). - - There seems to be no consensus how to chose the subseries lenghts n. - This function therefore leaves the choice to the user. The module provides - some utility functions for "typical" values: - - * binary_n: N/2, N/4, N/8, ... - * logarithmic_n: min_n, min_n * f, min_n * f^2, ... - - References: - .. [h_1] H. E. Hurst, “The problem of long-term storage in reservoirs,” - International Association of Scientific Hydrology. Bulletin, vol. 1, - no. 3, pp. 13–27, 1956. - .. [h_2] H. E. Hurst, “A suggested statistical model of some time series - which occur in nature,” Nature, vol. 180, p. 494, 1957. - .. [h_3] R. Weron, “Estimating long-range dependence: finite sample - properties and confidence intervals,” Physica A: Statistical Mechanics - and its Applications, vol. 312, no. 1, pp. 285–299, 2002. - - Reference Code: - .. [h_a] "hurst" function in R-package "pracma", - url: https://cran.r-project.org/web/packages/pracma/pracma.pdf - - Note: Pracma yields several estimates of the Hurst exponent, which - are listed below. Unless otherwise stated they use the divisors - of the length of the sequence as n. The length is reduced by at - most 1% to find the value that has the most divisors. - - * The "Simple R/S" estimate is just log((R/S)_n) / log(n) for - n = N. - * The "theoretical Hurst exponent" is the value that would be - expected of an uncorrected rescaled range approach for random - noise of the size of the input data. - * The "empirical Hurst exponent" is the uncorrected Hurst exponent - obtained by the rescaled range approach. - * The "corrected empirical Hurst exponent" is the - Anis-Lloyd-Peters corrected Hurst exponent, but with - sqrt(1/2 * pi * n) added to the (R/S)_n before the log. - * The "corrected R over S Hurst exponent" uses the R-function "lm" - instead of pracmas own "polyfit" and uses n = N/2, N/4, N/8, ... 
- by successively halving the subsequences (which means that some - subsequences may be one element longer than others). In contrast - to its name it does not use the Anis-Lloyd-Peters correction - factor. - - If you want to compare the output of pracma to the output of - nolds, the "empirical hurst exponent" is the only measure that - exactly corresponds to the Hurst measure implemented in nolds - (by choosing corrected=False, fit="poly" and employing the same - strategy for choosing n as the divisors of the (reduced) - sequence length). - .. [h_b] Rafael Weron, "HURST: MATLAB function to compute the Hurst - exponent using R/S Analysis", - url: https://ideas.repec.org/c/wuu/hscode/m11003.html - - Note: When the same values for nvals are used and fit is set to - "poly", nolds yields exactly the same results as this - implementation. - .. [h_c] Bill Davidson, "Hurst exponent", - url: http://www.mathworks.com/matlabcentral/fileexchange/9842-hurst-exponent - - Args: - data (array-like of float): - time series - Kwargs: - nvals (iterable of int): - sizes of subseries to use - (default: logmid_n(total_N, ratio=1/4.0, nsteps=15) , that is 15 - logarithmically spaced values in the medium 25% of the logarithmic range) - - Generally, the choice for n is a trade-off between the length and the - number of the subsequences that are used for the calculation of the - (R/S)_n. Very low values of n lead to high variance in the ``r`` and - ``s`` while very high values may leave too few subsequences that the mean - along them is still meaningful. Logarithmic spacing makes sense, because - it translates to even spacing in the log-log-plot. 
- fit (str): - the fitting method to use for the line fit, either 'poly' for normal - least squares polynomial fitting or 'RANSAC' for RANSAC-fitting which - is more robust to outliers - debug_plot (boolean): - if True, a simple plot of the final line-fitting step will be shown - debug_data (boolean): - if True, debugging data will be returned alongside the result - plot_file (str): - if debug_plot is True and plot_file is not None, the plot will be saved - under the given file name instead of directly showing it through - ``plt.show()`` - corrected (boolean): - if True, the Anis-Lloyd-Peters correction factor will be applied to the - output according to the expected value for the individual (R/S)_n - (see [h_3]_) - unbiased (boolean): - if True, the standard deviation based on the unbiased variance - (1/(N-1) instead of 1/N) will be used. This should be the default choice, - since the true mean of the sequences is not known. This parameter should - only be changed to recreate results of other implementations. 
- - Returns: - float: - estimated Hurst exponent K using a rescaled range approach (if K = 0.5 - there are no long-range correlations in the data, if K < 0.5 there are - negative long-range correlations, if K > 0.5 there are positive - long-range correlations) - (1d-vector, 1d-vector, list): - only present if debug_data is True: debug data of the form - ``(nvals, rsvals, poly)`` where ``nvals`` are the values used for log(n), - ``rsvals`` are the corresponding log((R/S)_n) and ``poly`` are the line - coefficients (``[slope, intercept]``) - """ - data = np.asarray(data) - total_N = len(data) - if nvals is None: - # chooses a default value for nvals that will give 15 logarithmically - # spaced datapoints leaning towards the middle of the logarithmic range - # (since both too small and too large n introduce too much variance) - nvals = logmid_n(total_N, ratio=1/4.0, nsteps=15) - # get individual values for (R/S)_n - rsvals = np.array([rs(data, n, unbiased=unbiased) for n in nvals]) - # filter NaNs (zeros should not be possible, because if R is 0 then - # S is also zero) - not_nan = np.logical_not(np.isnan(rsvals)) - rsvals = rsvals[not_nan] - nvals = np.asarray(nvals)[not_nan] - # it may happen that no rsvals are left (if all values of data are the same) - if len(rsvals) == 0: - poly = [np.nan, np.nan] - if debug_plot: - warnings.warn( - "Cannot display debug plot, all (R/S)_n are NaN", - RuntimeWarning - ) - else: - # fit a line to the logarithm of the obtained (R/S)_n - xvals = np.log(nvals) - yvals = np.log(rsvals) - if corrected: - yvals -= np.log([expected_rs(n) for n in nvals]) - poly = poly_fit(xvals, yvals, 1, fit=fit) - if debug_plot: - plot_reg(xvals, yvals, poly, "log(n)", "log((R/S)_n)", - fname=plot_file) - # account for correction if necessary - h = poly[0] + 0.5 if corrected else poly[0] - # return line slope (+ correction) as hurst exponent - if debug_data: - return (h, (np.log(nvals), np.log(rsvals), poly)) - else: + if fname is None: + plt.show() + 
else: + plt.savefig(fname) + plt.close() + + +# TODO: this is not used anywhere. Do we still need it? +def plot_reg_tiled( + xvals: FloatArray2D, + yvals: FloatArray2D, + polys: list[FloatArray1D] | None = None, + x_label: str = "x", + y_label: str = "y", + data_labels: list[str] | None = None, + reg_labels: list[str] | None = None, + fname: str | Path | None = None, + columns: int | None = None, +) -> None: + """Plots trend lines for multiple line-fitting approaches in a tiled layout. + + This function will show a plot through ``plt.show()`` and close it after the + window has been closed by the user unless `fname` is provided, in which case + the plot will be saved to disc under the given file name instead. + + Args: + xvals: values on the x-axis in shape (#plots, #datapoints) + yvals: values on the y-axis in shape (#plots, #datapoints) + polys: polynomial parameters as accepted by ``np.polyval`` in shape + (#plots, #params) + x_label: x axis label + y_label: y axis label + data_labels: labels of the povided datasets + reg_labels: labels of the regression lines + fname: file name (if not None, the plot will be saved to disc instead of + showing it though ``plt.show()``) + columns: number of columns for the tiled view, defaults to minimum + number required to obtain a square grid. 
+ """ + # local import to avoid dependency for non-debug use + import matplotlib.pyplot as plt + + max_span = max([float(np.max(y)) - float(np.min(y)) for y in yvals]) + means = [np.mean(y) for y in yvals] + if columns is None: + columns = min(4, int(np.ceil(np.sqrt(len(xvals))))) + if data_labels is None: + data_labels = ["data"] * len(xvals) + if reg_labels is None: + reg_labels = ["regression line"] * len(xvals) + for i in range(len(xvals)): + plt.subplot(int(np.ceil(len(xvals) / columns)), columns, i + 1) + plt.plot(xvals[i], yvals[i], "bo", label=data_labels[i]) + if polys is not None: + plt.plot(xvals[i], np.polyval(polys[i], xvals[i]), "r-", label=reg_labels[i]) + plt.xlabel(x_label) + plt.ylabel(y_label) + plt.ylim(means[i] - max_span / 2, means[i] + max_span / 2) + plt.legend(loc="best") + if fname is None: + plt.show() + else: + plt.savefig(fname) + plt.close() + + +def plot_reg_multiple( + xvals: FloatArray2D, + yvals: FloatArray2D, + polys: FloatArray2D | None = None, + x_label: str = "x", + y_label: str = "y", + data_labels: list[str] | None = None, + reg_labels: list[str] | None = None, + fname: str | Path | None = None, +) -> None: + """Plots trend lines for multiple line-fitting approaches in a the same plot. + + This function will show a plot through ``plt.show()`` and close it after the + window has been closed by the user unless``fname` is provided, in which case + the plot will be saved to disc under the given file name instead. 
+ + Args: + xvals: values on the x-axis in shape (#plots, #datapoints) + yvals: values on the y-axis in shape (#plots, #datapoints) + polys: polynomial parameters as accepted by ``np.polyval`` in shape + (#plots, #params) + x_label: x axis label + y_label: y axis label + data_labels: labels of the povided datasets + reg_labels: labels of the regression lines + fname: file name (if not None, the plot will be saved to disc instead of + showing it though ``plt.show()``) + """ + import matplotlib.pyplot as plt + + if data_labels is None: + data_labels = ["data"] * len(xvals) + if reg_labels is None: + reg_labels = ["regression line"] * len(xvals) + for i in range(len(xvals)): + plt.plot(xvals[i], yvals[i], "+", label=data_labels[i]) + if polys is not None: + plt.plot(xvals[i], np.polyval(polys[i], xvals[i]), label=reg_labels[i]) + plt.xlabel(x_label) + plt.ylabel(y_label) + plt.legend(loc="best") + if fname is None: + plt.show() + else: + plt.savefig(fname) + plt.close() + + +@overload +def hurst_rs( + data: NumberArrayLike1D, + nvals: IntArrayLike1D | None = None, + fit: FittingMethod = "RANSAC", + *, + debug_plot: bool = False, + debug_data: Literal[False] = False, + plot_file: str | Path | None = None, + corrected: bool = True, + unbiased: bool = True, + random_state: int | None = None, +) -> float: ... + + +@overload +def hurst_rs( + data: NumberArrayLike1D, + nvals: IntArrayLike1D | None = None, + fit: FittingMethod = "RANSAC", + *, + debug_plot: bool = False, + debug_data: Literal[True] = True, + plot_file: str | Path | None = None, + corrected: bool = True, + unbiased: bool = True, + random_state: int | None = None, +) -> tuple[ + float, + tuple[ + FloatArray1D, + FloatArray1D, + FloatArray1D, + ], +]: ... 
+ + +def hurst_rs( + data: NumberArrayLike1D, + nvals: IntArrayLike1D | None = None, + fit: FittingMethod = "RANSAC", + *, + debug_plot: bool = False, + debug_data: bool = False, + plot_file: str | Path | None = None, + corrected: bool = True, + unbiased: bool = True, + random_state: int | None = None, +) -> ( + float + | tuple[ + float, + tuple[ + FloatArray1D, + FloatArray1D, + FloatArray1D, + ], + ] +): + """Calculates the Hurst exponent by a standard rescaled range (R/S) approach. + + Explanation of Hurst exponent: + The Hurst exponent is a measure for the "long-term memory" of a + time series, meaning the long statistical dependencies in the data that do + not originate from cycles. + + It originates from H. E. Hurst's observations of the problem of long-term + storage in water reservoirs. If x_i is the discharge of a river in year i + and we observe this discharge for N years, we can calculate the storage + capacity that would be required to keep the discharge steady at its mean + value. + + To do so, we first subtract the mean over all x_i from the individual + x_i to obtain the departures x'_i from the mean for each year i. As the + excess or deficit in discharge always carries over from year i to year i+1, + we need to examine the cumulative sum of x'_i, denoted by y_i. This + cumulative sum represents the filling of our hypothetical storage. If the + sum is above 0, we are storing excess discharge from the river, if it is + below zero we have compensated a deficit in discharge by releasing + water from the storage. The range (maximum - minimum) R of y_i therefore + represents the total capacity required for the storage. + + Hurst showed that this value follows a steady trend for varying N if it + is normalized by the standard deviation sigma over the x_i. Namely he + obtained the following formula: + + R/sigma = (N/2)^K + + In this equation, K is called the Hurst exponent.
Its value is 0.5 for + white noise, but becomes greater for time series that exhibit some positive + dependency on previous values. For negative dependencies it becomes less + than 0.5. + + Explanation of the algorithm: + The rescaled range (R/S) approach is directly derived from Hurst's + definition. The time series of length N is split into non-overlapping + subseries of length n. Then, R and S (S = sigma) are calculated for each + subseries and the mean is taken over all subseries yielding (R/S)_n. This + process is repeated for several lengths n. Finally, the exponent K is + obtained by fitting a straight line to the plot of log((R/S)_n) vs log(n). + + There seems to be no consensus on how to choose the subseries lengths n. + This function therefore leaves the choice to the user. The module provides + some utility functions for "typical" values: + + * binary_n: N/2, N/4, N/8, ... + * logarithmic_n: min_n, min_n * f, min_n * f^2, ... + + References: + .. [h_1] H. E. Hurst, “The problem of long-term storage in reservoirs,” + International Association of Scientific Hydrology. Bulletin, vol. 1, + no. 3, pp. 13–27, 1956. + .. [h_2] H. E. Hurst, “A suggested statistical model of some time series + which occur in nature,” Nature, vol. 180, p. 494, 1957. + .. [h_3] R. Weron, “Estimating long-range dependence: finite sample + properties and confidence intervals,” Physica A: Statistical Mechanics + and its Applications, vol. 312, no. 1, pp. 285–299, 2002. + + Reference Code: + .. [h_a] "hurst" function in R-package "pracma", + url: https://cran.r-project.org/web/packages/pracma/pracma.pdf + + Note: Pracma yields several estimates of the Hurst exponent, which + are listed below. Unless otherwise stated they use the divisors + of the length of the sequence as n. The length is reduced by at + most 1% to find the value that has the most divisors. + + * The "Simple R/S" estimate is just log((R/S)_n) / log(n) for + n = N.
+ * The "theoretical Hurst exponent" is the value that would be + expected of an uncorrected rescaled range approach for random + noise of the size of the input data. + * The "empirical Hurst exponent" is the uncorrected Hurst exponent + obtained by the rescaled range approach. + * The "corrected empirical Hurst exponent" is the + Anis-Lloyd-Peters corrected Hurst exponent, but with + sqrt(1/2 * pi * n) added to the (R/S)_n before the log. + * The "corrected R over S Hurst exponent" uses the R-function "lm" + instead of pracma's own "polyfit" and uses n = N/2, N/4, N/8, ... + by successively halving the subsequences (which means that some + subsequences may be one element longer than others). In contrast + to its name it does not use the Anis-Lloyd-Peters correction + factor. + + If you want to compare the output of pracma to the output of + nolds, the "empirical Hurst exponent" is the only measure that + exactly corresponds to the Hurst measure implemented in nolds + (by choosing corrected=False, fit="poly" and employing the same + strategy for choosing n as the divisors of the (reduced) + sequence length). + .. [h_b] Rafael Weron, "HURST: MATLAB function to compute the Hurst + exponent using R/S Analysis", + url: https://ideas.repec.org/c/wuu/hscode/m11003.html + + Note: When the same values for nvals are used and fit is set to + "poly", nolds yields exactly the same results as this + implementation. + .. [h_c] Bill Davidson, "Hurst exponent", + url: http://www.mathworks.com/matlabcentral/fileexchange/9842-hurst-exponent + + Args: + data: time series + nvals: sizes of subseries to use + (default: logmid_n(total_N, ratio=1/4.0, nsteps=15), that is 15 + logarithmically spaced values in the middle 25% of the logarithmic range) + + Generally, the choice for n is a trade-off between the length and the + number of the subsequences that are used for the calculation of the + (R/S)_n.
Very low values of n lead to high variance in the ``r`` and + ``s`` while very high values may leave too few subsequences that the mean + along them is still meaningful. Logarithmic spacing makes sense, because + it translates to even spacing in the log-log-plot. + fit: the fitting method to use for the line fit, either 'poly' for normal + least squares polynomial fitting or 'RANSAC' for RANSAC-fitting which + is more robust to outliers + debug_plot: if True, a simple plot of the final line-fitting step will be shown + debug_data: if True, debugging data will be returned alongside the result + plot_file: if debug_plot is True and plot_file is not None, the plot will be saved + under the given file name instead of directly showing it through + ``plt.show()`` + corrected: if True, the Anis-Lloyd-Peters correction factor will be applied to the + output according to the expected value for the individual (R/S)_n + (see [h_3]_) + unbiased: if True, the standard deviation based on the unbiased variance + (1/(N-1) instead of 1/N) will be used. This should be the default choice, + since the true mean of the sequences is not known. This parameter should + only be changed to recreate results of other implementations. 
+ random_state: Seed for random number generator used for RANSAC + + Returns: + Estimated Hurst exponent K using a rescaled range approach (if K = 0.5 + there are no long-range correlations in the data, if K < 0.5 there are + negative long-range correlations, if K > 0.5 there are positive + long-range correlations). + + If ``debug_data`` is True, the return value is instead a tuple containing + + * the Hurst exponent K + * a tuple of three arrays: + - nvals: the logarithm of the values used for n + - rsvals: the logarithm of the corresponding (R/S)_n values + - poly: the coefficients of the line fit (``[slope, intercept]``) + """ + data = np.asarray(data, dtype=float_precision) + total_N = len(data) + if nvals is None: + # chooses a default value for nvals that will give 15 logarithmically + # spaced datapoints leaning towards the middle of the logarithmic range + # (since both too small and too large n introduce too much variance) + nvals = logmid_n(total_N, ratio=1 / 4.0, nsteps=15) + else: + nvals = np.array(nvals, dtype=int_precision) + # get individual values for (R/S)_n + rsvals = np.array([rs(data, n, unbiased=unbiased) for n in nvals], dtype=float_precision) + # filter NaNs (zeros should not be possible, because if R is 0 then + # S is also zero) + not_nan = np.logical_not(np.isnan(rsvals)) + rsvals = rsvals[not_nan] + nvals = np.asarray(nvals, dtype=int_precision)[not_nan] + # it may happen that no rsvals are left (if all values of data are the same) + if len(rsvals) == 0: + poly = np.array([np.nan, np.nan], dtype=float_precision) + if debug_plot: + warnings.warn( + "Cannot display debug plot, all (R/S)_n are NaN", + RuntimeWarning, + stacklevel=2, + ) + else: + # fit a line to the logarithm of the obtained (R/S)_n + xvals = np.log(nvals) + yvals = np.log(rsvals) + if corrected: + yvals -= np.log([expected_rs(n) for n in nvals]) + poly = poly_fit(xvals, yvals, 1, fit=fit, random_state=random_state) + if debug_plot: + plot_reg(xvals, yvals, poly, "log(n)", "log((R/S)_n)", fname=plot_file) +
# account for correction if necessary + h = poly[0] + 0.5 if corrected else poly[0] + # return line slope (+ correction) as hurst exponent + if debug_data: + return (h, (np.log(nvals), np.log(rsvals), poly)) return h -# TODO implement MFDFA as second (more reliable) measure for multifractality -# NOTE: probably not needed, since mfhurst_b is already pretty reliable - - -def mfhurst_b(data, qvals=None, dists=None, fit='poly', - debug_plot=False, debug_data=False, plot_file=None): - """ - Calculates the Generalized Hurst Exponent H_q for different q according to - A.-L. Barabási and T. Vicsek. - - Explanation of the Generalized Hurst Exponent: - The Generalized Hurst Exponent (GHE, H_q or H(q)) can (as the name implies) - be seen as a generalization of the Hurst exponent for data series with - multifractal properties. It's origins are however not directly related - to Hurst's rescaled range approach, but to the definition of self-affine - functions. - - A single-valued self-affine function h by definition satisfies the relation - - h(x) ~= lambda^(-H) h(lambda x) - - for any positive real valued lambda and some positive real valued exponent - H, which is called the Hurst, Hölder, Hurst-Hölder or roughness exponent - in the literature. In other words you can view lambda as a scaling factor - or "step size". With lambda < 1 we decrease the step size and zoom into our - function. In this case lambda^(-H) becomes greater than one, meaning that - h(lambda x) looks similar to a smaller version of h(x). With lambda > 1 we - zoom out and get lambda^(-H) < 1. - - To calculate H, you can use the height-height correlation function (also - called autocorrelation) c(d) = <(h(x) - h(x + d))^2>_x where <...>_x - denotes the expected value over x. Here, the aforementioned self-affine - property is equivalent to c(d) ~ d^(2H). You can also think of d as a step - size. 
Increasing or decreasing d from 1 to some y is the same as setting - lambda = y: It increases or decreases the scale of the function by a factor - of 1/y^(-H) = y^H. Therefore the squared differences will be proportional - to y^2H. - - A.-L. Barabási and T. Vicsek extended this notion to an infinite hierarchy - of exponents H_q for the qth-order correlation function with - - c_q(d) = <(h(x) - h(x + d))^q>_x ~ d^(q H_q) - - With q = 1 you get a value H_1 that is closely related to the normal Hurst - exponent, but with different q you either get a constant value H_q = H_0 - independent of q, which indicates that the function has no multifractal - properties, or different H_q, which is a sign for multifractal behavior. - - T. Di Matteo, T. Aste and M. M. Dacorogna applied this technique to - financial data series and gave it the name "Generalized Hurst Exponent". - - Explanation of the Algorithm: - Curiously, I could not find any algorithmic description how to calculate - H_q in the literature. Researchers seem to just imply that you can obtain - the exponent by a line fitting algorithm in a log-log plot, but they do not - talk about the actual procedure or the required parameters. - - Essentially, we can calculate c_q(d) of a discrete evenly sampled time - series Y = [y_0, y_1, y_2, ... y_(N-1)] by taking the absolute differences - [\|y_0 - y_d\|, \|y_1 - y_(d+1)\|, ... , \|y_(N-d-1) - y_(N-1)\|] raising them to - the qth power and taking the mean. - - Now we take the logarithm on both sides of our relation c_q(d) ~ d^(q H_q) - and get - - log(c_q(d)) ~ log(d) * q H_q - - So in other words if we plot log(c_q(d)) against log(d) for several d we - should get a straight line with slope q H_q. This enables us to use a - linear least squares algorithm to obtain H_q. - - Note that we consider x as a discrete variable in the range 0 <= x < N. - We can do this, because the actual sampling rate of our data series does - not alter the result. 
After taking the logarithm any scaling factor delta_x - would only result in an additive term since - log(delta_x * x) = log(x) + log(delta_x) and we only care about the slope - of the line and not the intercept. - - References: - .. [mh_1] A.-L. Barabási and T. Vicsek, “Multifractality of self-affine - fractals,” Physical Review A, vol. 44, no. 4, pp. 2730–2733, 1991. - - Args: - data (array-like of float): - time series of data points (should be evenly sampled) - - Kwargs: - qvals (iterable of float or int): - values of q for which H_q should be calculated (default: [1]) - dists (iterable of int): - distances for which the height-height correlation should be calculated - (determines the x-coordinates in the log-log plot) - default: logarithmic_n(1, max(20, 0.02 * len(data)), 1.5) to ensure - even spacing on the logarithmic axis - fit (str): - the fitting method to use for the line fit, either 'poly' for normal - least squares polynomial fitting or 'RANSAC' for RANSAC-fitting which - is more robust to outliers - debug_plot (boolean): - if True, a simple plot of the final line-fitting step will be shown - debug_data (boolean): - if True, debugging data will be returned alongside the result - plot_file (str): - if debug_plot is True and plot_file is not None, the plot will be saved - under the given file name instead of directly showing it through - ``plt.show()`` - - Returns: - array of float: - list of H_q for every q given in ``qvals`` - (1d-vector, 2d-vector, 2d-vector): - only present if debug_data is True: debug data of the form - ``(xvals, yvals, poly)`` where ``xvals`` is the logarithm of ``dists``, - ``yvals`` are the logarithms of the corresponding height-height- - correlations for each distance (first dimension) and each q - (second dimension) in the shape len(dists) x len(qvals) and ``poly`` are - the line coefficients (``[slope, intercept]``) for each q in the shape - len(qvals) x 2. 
- """ - # transform to array if necessary - data = np.asarray(data, dtype=np.float64) - if qvals is None: - # actual default parameter would introduce shared list - # see: http://pylint-messages.wikidot.com/messages:w0102 - qvals = [1] - if dists is None: - dists = logarithmic_n(1, max(20, 0.02 * len(data)), 1.5) - dists = np.asarray(dists) - if len(data) < 60: - warnings.warn( - "H(q) is not reliable for small time series ({} < 60)".format(len(data)) - ) - def hhcorr(d, q): - diffs = np.abs(data[:-d] - data[d:]) - diffs = diffs[np.where(diffs > 0)] - return np.mean(diffs ** q) - - # calculate height-height correlations - corrvals = [hhcorr(d, q) for d in dists for q in qvals] - corrvals = np.array(corrvals, dtype=np.float64) - corrvals = corrvals.reshape(len(dists), len(qvals)) - - # line fitting - xvals = np.log(dists) - yvals = np.log(corrvals) - polys = [ - poly_fit(xvals, yvals[:, qi], 1, fit=fit) - for qi in range(len(qvals)) - ] - H = np.array(polys)[:, 0] / qvals - if debug_plot: - plot_reg_multiple( - [xvals] * len(qvals), - [yvals[:, qi] / qvals[qi] for qi in range(len(qvals))], - [p / q for p, q in zip(polys, qvals)], - x_label="log(x)", y_label="$\\log(c_q(x)) / q$", - data_labels=["q = %d" % q for q in qvals], - reg_labels=["reg. line (H = {:.3f})".format(h) for h in H], - fname=plot_file +@overload +def mfhurst_b( + data: NumberArrayLike1D, + qvals: FloatArrayLike1D | None = None, + dists: IntArrayLike1D | None = None, + fit: FittingMethod = "poly", + *, + debug_plot: bool = False, + debug_data: Literal[False] = False, + plot_file: str | Path | None = None, + random_state: int | None = None, +) -> FloatArray1D: ... 
+ + +@overload +def mfhurst_b( + data: NumberArrayLike1D, + qvals: FloatArrayLike1D | None = None, + dists: IntArrayLike1D | None = None, + fit: FittingMethod = "poly", + *, + debug_plot: bool = False, + debug_data: Literal[True] = True, + plot_file: str | Path | None = None, + random_state: int | None = None, +) -> tuple[ + FloatArray1D, + tuple[ + FloatArray1D, + FloatArray1D, + FloatArray1D, + ], +]: ... + + +def mfhurst_b( + data: NumberArrayLike1D, + qvals: FloatArrayLike1D | None = None, + dists: IntArrayLike1D | None = None, + fit: FittingMethod = "poly", + *, + debug_plot: bool = False, + debug_data: bool = False, + plot_file: str | Path | None = None, + random_state: int | None = None, +) -> ( + FloatArray1D + | tuple[ + FloatArray1D, + tuple[ + FloatArray1D, + FloatArray1D, + FloatArray1D, + ], + ] +): + r"""Calculates the Generalized Hurst Exponent H_q according to A.-L. Barabási and T. Vicsek. + + Explanation of the Generalized Hurst Exponent: + The Generalized Hurst Exponent (GHE, H_q or H(q)) can (as the name implies) + be seen as a generalization of the Hurst exponent for data series with + multifractal properties. Its origins are however not directly related + to Hurst's rescaled range approach, but to the definition of self-affine + functions. + + A single-valued self-affine function h by definition satisfies the relation + + h(x) ~= lambda^(-H) h(lambda x) + + for any positive real valued lambda and some positive real valued exponent + H, which is called the Hurst, Hölder, Hurst-Hölder or roughness exponent + in the literature. In other words you can view lambda as a scaling factor + or "step size". With lambda < 1 we decrease the step size and zoom into our + function. In this case lambda^(-H) becomes greater than one, meaning that + h(lambda x) looks similar to a smaller version of h(x). With lambda > 1 we + zoom out and get lambda^(-H) < 1.
+ + To calculate H, you can use the height-height correlation function (also + called autocorrelation) c(d) = <(h(x) - h(x + d))^2>_x where <...>_x + denotes the expected value over x. Here, the aforementioned self-affine + property is equivalent to c(d) ~ d^(2H). You can also think of d as a step + size. Increasing or decreasing d from 1 to some y is the same as setting + lambda = y: It increases or decreases the scale of the function by a factor + of 1/y^(-H) = y^H. Therefore the squared differences will be proportional + to y^2H. + + A.-L. Barabási and T. Vicsek extended this notion to an infinite hierarchy + of exponents H_q for the qth-order correlation function with + + c_q(d) = <(h(x) - h(x + d))^q>_x ~ d^(q H_q) + + With q = 1 you get a value H_1 that is closely related to the normal Hurst + exponent, but with different q you either get a constant value H_q = H_0 + independent of q, which indicates that the function has no multifractal + properties, or different H_q, which is a sign for multifractal behavior. + + T. Di Matteo, T. Aste and M. M. Dacorogna applied this technique to + financial data series and gave it the name "Generalized Hurst Exponent". + + Explanation of the Algorithm: + Curiously, I could not find any algorithmic description how to calculate + H_q in the literature. Researchers seem to just imply that you can obtain + the exponent by a line fitting algorithm in a log-log plot, but they do not + talk about the actual procedure or the required parameters. + + Essentially, we can calculate c_q(d) of a discrete evenly sampled time + series Y = [y_0, y_1, y_2, ... y_(N-1)] by taking the absolute differences + [\|y_0 - y_d\|, \|y_1 - y_(d+1)\|, ... , \|y_(N-d-1) - y_(N-1)\|] raising them to + the qth power and taking the mean. 
+ + Now we take the logarithm on both sides of our relation c_q(d) ~ d^(q H_q) + and get + + log(c_q(d)) ~ log(d) * q H_q + + So in other words if we plot log(c_q(d)) against log(d) for several d we + should get a straight line with slope q H_q. This enables us to use a + linear least squares algorithm to obtain H_q. + + Note that we consider x as a discrete variable in the range 0 <= x < N. + We can do this, because the actual sampling rate of our data series does + not alter the result. After taking the logarithm any scaling factor delta_x + would only result in an additive term since + log(delta_x * x) = log(x) + log(delta_x) and we only care about the slope + of the line and not the intercept. + + References: + .. [mh_1] A.-L. Barabási and T. Vicsek, “Multifractality of self-affine + fractals,” Physical Review A, vol. 44, no. 4, pp. 2730–2733, 1991. + + Args: + data: time series of data points (should be evenly sampled) + qvals: values of q for which H_q should be calculated (default: [1]) + dists: distances for which the height-height correlation should be calculated + (determines the x-coordinates in the log-log plot) + default: logarithmic_n(1, max(20, 0.02 * len(data)), 1.5) to ensure + even spacing on the logarithmic axis + fit: the fitting method to use for the line fit, either 'poly' for normal + least squares polynomial fitting or 'RANSAC' for RANSAC-fitting which + is more robust to outliers + debug_plot: if True, a simple plot of the final line-fitting step will be shown + debug_data: if True, debugging data will be returned alongside the result + plot_file: if debug_plot is True and plot_file is not None, the plot will be saved + under the given file name instead of directly showing it through + ``plt.show()`` + random_state: Seed for random number generator used for RANSAC + + Returns: + list of H_q for every q given in ``qvals``. 
If ``debug_data`` is True, + the return value is instead a tuple containing + + * the H_q values for each q in ``qvals`` + * a tuple of three arrays + - xvals: the logarithm of the distances used for the height-height + correlation + - yvals: the logarithm of the height-height correlations for each + distance (first dimension) and each q (second dimension) with shape + (len(dists), len(qvals)) + - poly: the coefficients of the line fit (``[slope, intercept]``) + for each q in the shape (len(qvals), 2). + """ + # transform to array if necessary + data = np.asarray(data, dtype=float_precision) + if qvals is None: + # actual default parameter would introduce shared list + # see: http://pylint-messages.wikidot.com/messages:w0102 + qvals = [1] + qvals = np.asarray(qvals, dtype=float_precision) + if dists is None: + dists = logarithmic_n(1, np.ceil(max(20, 0.02 * len(data))), 1.5) + dists = np.asarray(dists, dtype=int_precision) + min_reliable_n = 60 + if len(data) < min_reliable_n: + warnings.warn( + f"H(q) is not reliable for small time series ({len(data)} < 60)", + stacklevel=2, + ) + + def hhcorr(d: np.integer, q: np.floating) -> np.floating: + """Calculates the height-height correlation for a given distance d and q.""" + diffs = np.abs(data[:-d] - data[d:]) + diffs = diffs[np.where(diffs > 0)] + return np.mean(diffs**q) + + # calculate height-height correlations + corrvals = [hhcorr(d, q) for d in dists for q in qvals] + corrvals = np.array(corrvals, dtype=float_precision) + corrvals = corrvals.reshape(len(dists), len(qvals)) + + # line fitting + xvals = np.log(dists) + yvals = np.log(corrvals) + polys = np.array( + [ + poly_fit(xvals, yvals[:, qi], 1, fit=fit, random_state=random_state) + for qi in range(len(qvals)) + ], + dtype=float_precision, ) - if debug_data: - return H, (xvals, yvals, polys) - else: + H = polys[:, 0] / qvals + if debug_plot: + plot_reg_multiple( + np.array([xvals] * len(qvals), dtype=float_precision), + np.array([yvals[:, qi] / qvals[qi] for 
qi in range(len(qvals))], dtype=float_precision), + np.array([p / q for p, q in zip(polys, qvals, strict=False)], dtype=float_precision), + x_label="log(x)", + y_label="$\\log(c_q(x)) / q$", + data_labels=[f"q = {q}" for q in qvals], + reg_labels=[f"reg. line (H = {h:.3f})" for h in H], + fname=plot_file, + ) + if debug_data: + return H, (xvals, yvals, polys) return H -def _genhurst(S, q): - """ - Computes the generalized hurst exponent H_q for time series S. +def _genhurst(S: FloatArray1D, q: float) -> float: + """Computes the generalized hurst exponent H_q for time series S. This function should not be used. It is only kept here to demonstrate that ``mfhurst_dm`` is implemented correctly. You can use the following call to @@ -1606,617 +2037,832 @@ def _genhurst(S, q): ## formatting and datatype fixes : Christopher Schölzel, 17/02/2019 ## """ L = len(S) - if L < 100: - warnings.warn('Data series very short!') + if L < 100: # noqa: PLR2004 + warnings.warn("Data series very short!", stacklevel=2) H = np.zeros((len(range(5, 20)), 1)) k = 0 for Tmax in range(5, 20): - - x = np.arange(1, Tmax+1, 1) + x = np.arange(1, Tmax + 1, 1) mcord = np.zeros((Tmax, 1)) - for tt in range(1, Tmax+1): - dV = S[np.arange(tt, L, tt)] - S[np.arange(tt, L, tt)-tt] - VV = S[np.arange(tt, L+tt, tt)-tt] + for tt in range(1, Tmax + 1): + dV = S[np.arange(tt, L, tt)] - S[np.arange(tt, L, tt) - tt] + VV = S[np.arange(tt, L + tt, tt) - tt] N = len(dV) + 1 - X = np.arange(1, N+1, dtype=np.float64) + X = np.arange(1, N + 1, dtype=float_precision) Y = VV - mx = np.sum(X)/N - SSxx = np.sum(X**2) - N*mx**2 - my = np.sum(Y)/N - SSxy = np.sum(np.multiply(X, Y)) - N*mx*my - cc1 = SSxy/SSxx - cc2 = my - cc1*mx + mx = np.sum(X) / N + SSxx = np.sum(X**2) - N * mx**2 + my = np.sum(Y) / N + SSxy = np.sum(np.multiply(X, Y)) - N * mx * my + cc1 = SSxy / SSxx + cc2 = my - cc1 * mx ddVd = dV - cc1 - VVVd = VV - np.multiply(cc1, np.arange(1, N+1, dtype=np.float64)) \ - - cc2 - mcord[tt-1] = 
np.mean(np.abs(ddVd)**q)/np.mean(np.abs(VVVd)**q) + VVVd = VV - np.multiply(cc1, np.arange(1, N + 1, dtype=float_precision)) - cc2 + mcord[tt - 1] = np.mean(np.abs(ddVd) ** q) / np.mean(np.abs(VVVd) ** q) mx = np.mean(np.log10(x)) - SSxx = np.sum(np.log10(x)**2) - Tmax*mx**2 + SSxx = np.sum(np.log10(x) ** 2) - Tmax * mx**2 my = np.mean(np.log10(mcord)) - SSxy = np.sum( - np.multiply( - np.log10(x), np.transpose(np.log10(mcord)) - ) - ) - Tmax*mx*my - H[k] = SSxy/SSxx + SSxy = ( + np.sum( + np.multiply( + np.log10(x), + np.transpose(np.log10(mcord)), + ), + ) + - Tmax * mx * my + ) + H[k] = SSxy / SSxx k = k + 1 - mH = np.mean(H)/q - - return mH - - -def _aste_line_fit(x, y): - """ - Simple linear regression with ordinary least squares - https://en.wikipedia.org/wiki/Simple_linear_regression - - NOTE: this function is left here to demonstrate the correctness of - T. Aste's MATLAB code for hurst_multifractal_dm. You can get the same - results with a call to ``np.polyfit(x, y, 1)[::-1]``. - """ - # convert to float to avoid integer overflow problems - x = np.asarray(x, dtype=np.float64) - y = np.asarray(y, dtype=np.float64) - N = len(x) - mx = np.mean(x) - my = np.mean(y) - # calculate the variance in x - # sum((x - mx) ^ 2) = sum(x ^ 2) - 2 * sum(x * mx) + N * mx ^ 2 - # = sum(x ^ 2) - 2 * mx * sum(x) + N * mx ^ 2 - # = sum(x ^ 2) - 2 * mx * N * mx + N * mx ^ 2 - # = sum(x ^ 2) - N * mx ^ 2 - var = np.sum(x ** 2) - N * mx * mx - # corvariance of x and y - # sum((x - mx) * (y - my)) - # = sum(xy) - sum(mx * y) - sum(my * x) + N * mx * my - # = sum(xy) - mx * sum(y) - my * sum(x) + N * mx * my - # = sum(xy) - mx * my * N - my * mx * N + N * mx * my - # = sum(xy) - N * mx * my - # NOTE: T. Aste's code is a little confusing here - # X = 1:N; - # Y = S(((tt+1):tt:(L+tt))-tt)'; - # ... - # SSxy = sum(X.*Y) - N*mx*my; - # Here, Y is transposed and the multiplication for SSxy uses .* instead of *. 
- # This suggests that we have a matrix multiplication with (possible) - # broadcasting. If X was an array and not a range, we would have a NxN array - # as a result since size(X) = [1, N] and size(Y) = [N, 1]. Ranges behave - # differently in MATLAB and this is the only reason why we get the correct - # result here. - cov = np.sum(x * y) - N * mx * my - # calculate slope and intercept (this is correct again) - slope = cov / var - intercept = my - slope * mx - return [intercept, slope] - - -def mfhurst_dm(data, qvals=None, max_dists=range(5, 20), detrend=True, - fit="poly", debug_plot=False, debug_data=False, plot_file=None): - """ - Calculates the Generalized Hurst Exponent H_q for different q according to - the MATLAB code of Tomaso Aste - one of the authors that introduced this - measure. - - Explanation of the General Hurst Exponent: - See mfhurst_b. - - Warning: I do not recommend to use this function unless you want to reproduce - examples from Di Matteo et al.. From my experiments and a critical code - analysis it seems that mfhurst_b should provide more robust results. - - The design choices that make mfhurst_dm different than mfhurst_d are the - following: - - - By default, a linear trend is removed from the data. This can be sensible - in some application areas (such as stock market analysis), but I think - this should be an additional preprocessing step and not part of this - algorithm. - - In the calculation of the height-height correlations, the differences - (h(x) - h(x + d) are not calculated for every possible x from 0 to N-d-1, - but instead d is used as a step size for x. I see no justification for - this choice. It makes the algorithm run faster, but it also takes away - a lot of statistical robustness, especially for large values of d. - This effect can be clearly seen when setting `debug_plot` to `True`. - - The algorithm uses a linear scale for the distance values d = 1, 2, 3, - ..., tau_max. 
This is counter intuitive, since we later plot log(d) - against log(c_q(d)). A linear scale will have a bias towards larger - values in the logarithmic scale. A logarithmic scale for d seems to be - a more natural fit. If low values of d yield statistically unstable - results, they should simply be omitted. - - The algorithm tests multiple values for tau_max, which is the maximum - distance that will be calculated. In [mhd_1]_ the authors state that this - is done to test the robustness of the approach. However, taking the - mean of several runs with different tau_max will not produce any more - information than performing one run with the largest tau_max. Instead - it will only introduce a bias towards low values for d. - - References: - .. [mhd_1] T. Di Matteo, T. Aste, and M. M. Dacorogna, “Scaling behaviors - in differently developed markets,” Physica A: Statistical Mechanics - and its Applications, vol. 324, no. 1–2, pp. 183–188, 2003. - - Reference code: - .. [mhd_a] Tomaso Aste, "Generalized Hurst exponent", - url: http://de.mathworks.com/matlabcentral/fileexchange/30076-generalized-hurst-exponent - - Args: - data (1d-vector of float): - input data (should be evenly sampled) - qvals (1d-vector of float) - values of q for which H_q should be calculated (default: [1]) - - Kwargs: - max_dists (1d-vector of int): - different values to test for tau_max, the maximum value for the distance - d. The resulting H_q will be a mean of all H_q calculated with tau_max - = max_dists[0], max_dists[1], ... . 
- detrend (boolean): - if True, a linear trend will be removed from the data before H_q will - be calculated - fit (str): - the fitting method to use for the line fit, either 'poly' for normal - least squares polynomial fitting or 'RANSAC' for RANSAC-fitting which - is more robust to outliers - debug_plot (boolean): - if True, a simple plot of the final line-fitting step will be shown - debug_data (boolean): - if True, debugging data will be returned alongside the result - plot_file (str): - if debug_plot is True and plot_file is not None, the plot will be saved - under the given file name instead of directly showing it through - ``plt.show()`` - - Returns: - array of float: - array of mH_q for every q given in ``qvals`` where mH_q is the mean of - all H_q calculated for different max distances in max_dists. - array of float: - array of standard deviations sH_q for each mH_q returned - (1d-vector, 2d-vector, 2d-vector): - only present if debug_data is True: debug data of the form - ``(xvals, yvals, poly)`` where ``xvals`` is the logarithm of ``dists``, - ``yvals`` are the logarithms of the corresponding height-height- - correlations for each distance (first dimension) and each q - (second dimension) in the shape len(dists) x len(qvals) and ``poly`` are - the line coefficients (``[slope, intercept]``) for each q in the shape - len(qvals) x 2. - """ - # transform to array if necessary - data = np.asarray(data) - if qvals is None: - # actual default parameter would introduce shared list - # see: http://pylint-messages.wikidot.com/messages:w0102 - qvals = [1] - if len(data) < 60: - warnings.warn( - "H(q) is not reliable for small time series ({} < 60)".format(len(data)) - ) - max_max_dist = np.max(max_dists) - hhcorr = [] - # NOTE: I don't think it's a good idea to use a linear scale for the distance - # values. Our fit is in logarithmic space, so this will place more weight on - # the higher distance. 
This is not bad per se, but if you think that the - # first values are unreliable, it would be better to skip them alltogether. - for dist in range(1, max_max_dist+1): - # NOTE: I don't think applying a step size to the input data is reasonable. - # I cannot find any justification for this in the papers and reduces the - # number of points that we can use to make our mean statistically stable. - step_size = dist - stepdata = data[::step_size] - if detrend: - stepdata = detrend_data(stepdata, order=1) - diffs = stepdata[1:] - stepdata[:-1] - hhcorr.append([ - np.mean(np.abs(diffs) ** q) / np.mean(np.abs(stepdata) ** q) - for q in qvals - ]) - hhcorr = np.array(hhcorr, dtype=np.float64) - xvals = np.log(np.arange(1, max_max_dist+1)) - yvals = np.log(hhcorr) - # NOTE: Using several maximum distances seems to be a strange way to - # introduce stability, since it only places emphasis on the lower distance - # ranges and does not introduce any new information. - H = np.array([ - poly_fit(xvals[:md], yvals[:md, qi], 1, fit=fit)[0] - for qi in range(len(qvals)) - for md in max_dists - ], dtype=np.float64).reshape(len(qvals), len(max_dists)) - if debug_plot: - polys = [ - np.array(poly_fit(xvals, yvals[:, qi], 1)) / qvals[qi] - for qi in range(len(qvals)) + return float(np.mean(H) / q) + + +def _aste_line_fit( + x: NumberArrayLike1D, + y: NumberArrayLike1D, +) -> list[np.floating]: + """Simple linear regression with ordinary least squares. + + See https://en.wikipedia.org/wiki/Simple_linear_regression. + + NOTE: this function is left here to demonstrate the correctness of + T. Aste's MATLAB code for hurst_multifractal_dm. You can get the same + results with a call to ``np.polyfit(x, y, 1)[::-1]``. 
+ """ + # convert to float to avoid integer overflow problems + x = np.asarray(x, dtype=float_precision) + y = np.asarray(y, dtype=float_precision) + N = len(x) + mx = np.mean(x) + my = np.mean(y) + # calculate the variance in x + # sum((x - mx) ^ 2) = sum(x ^ 2) - 2 * sum(x * mx) + N * mx ^ 2 + # = sum(x ^ 2) - 2 * mx * sum(x) + N * mx ^ 2 + # = sum(x ^ 2) - 2 * mx * N * mx + N * mx ^ 2 + # = sum(x ^ 2) - N * mx ^ 2 + var = np.sum(x**2) - N * mx * mx + # corvariance of x and y + # sum((x - mx) * (y - my)) # noqa: ERA001 + # = sum(xy) - sum(mx * y) - sum(my * x) + N * mx * my + # = sum(xy) - mx * sum(y) - my * sum(x) + N * mx * my + # = sum(xy) - mx * my * N - my * mx * N + N * mx * my + # = sum(xy) - N * mx * my + # NOTE: T. Aste's code is a little confusing here + # X = 1:N; + # Y = S(((tt+1):tt:(L+tt))-tt)'; + # ... + # SSxy = sum(X.*Y) - N*mx*my; + # Here, Y is transposed and the multiplication for SSxy uses .* instead of *. + # This suggests that we have a matrix multiplication with (possible) + # broadcasting. If X was an array and not a range, we would have a NxN array + # as a result since size(X) = [1, N] and size(Y) = [N, 1]. Ranges behave + # differently in MATLAB and this is the only reason why we get the correct + # result here. + cov = np.sum(x * y) - N * mx * my + # calculate slope and intercept (this is correct again) + slope = cov / var + intercept = my - slope * mx + return [intercept, slope] + + +@overload +def mfhurst_dm( + data: NumberArrayLike1D, + qvals: FloatArrayLike1D | None = None, + max_dists: IntArrayLike1D | None = None, + *, + detrend: bool = True, + fit: FittingMethod = "poly", + debug_plot: bool = False, + debug_data: Literal[False] = False, + plot_file: str | Path | None = None, + random_state: int | None = None, +) -> tuple[ + FloatArray1D, + FloatArray1D, +]: ... 
+ + +@overload +def mfhurst_dm( + data: NumberArrayLike1D, + qvals: FloatArrayLike1D | None = None, + max_dists: IntArrayLike1D | None = None, + *, + detrend: bool = True, + fit: FittingMethod = "poly", + debug_plot: bool = False, + debug_data: Literal[True] = True, + plot_file: str | Path | None = None, + random_state: int | None = None, +) -> tuple[ + FloatArray1D, + FloatArray1D, + tuple[ + FloatArray1D, + FloatArray2D, + FloatArray2D, + ], +]: ... + + +def mfhurst_dm( + data: NumberArrayLike1D, + qvals: FloatArrayLike1D | None = None, + max_dists: IntArrayLike1D | None = None, + *, + detrend: bool = True, + fit: FittingMethod = "poly", + debug_plot: bool = False, + debug_data: bool = False, + plot_file: str | Path | None = None, + random_state: int | None = None, +) -> ( + tuple[ + FloatArray1D, + FloatArray1D, ] - plot_reg_multiple( - [xvals] * len(qvals), - [yvals[:, qi] / qvals[qi] for qi in range(len(qvals))], - polys, - x_label="log(x)", y_label="$\\log(c_q(x)) / q$", - data_labels=["q = %d" % q for q in qvals], - reg_labels=["reg. line (H = {:.3f})".format(h) for h in H[:, -1] / qvals], - fname=plot_file - ) - mH = np.mean(H, axis=1) / qvals - sH = np.std(H, axis=1) / qvals - if debug_data: - return [mH, sH, (xvals, yvals, polys)] - else: - return [mH, sH] - - -def corr_dim(data, emb_dim, lag=1, rvals=None, dist=rowwise_euclidean, - fit="RANSAC", debug_plot=False, debug_data=False, plot_file=None): - """ - Calculates the correlation dimension with the Grassberger-Procaccia algorithm - - Explanation of correlation dimension: - The correlation dimension is a characteristic measure that can be used - to describe the geometry of chaotic attractors. It is defined using the - correlation sum C(r) which is the fraction of pairs of points X_i in the - phase space whose distance is smaller than r. - - If the relation between C(r) and r can be described by the power law - - C(r) ~ r^D - - then D is called the correlation dimension of the system. 
- - In a d-dimensional system, the maximum value for D is d. This value is - obtained for systems that expand uniformly in each dimension with time. - The lowest possible value is 0 for a system with constant C(r) (i.e. a - system that visits just one point in the phase space). Generally if D is - lower than d and the system has an attractor, this attractor is called - "strange" and D is a measure of this "strangeness". - - Explanation of the algorithm: - The Grassberger-Procaccia algorithm calculates C(r) for a range of - different r and then fits a straight line into the plot of log(C(r)) - versus log(r). - - This version of the algorithm is created for one-dimensional (scalar) time - series. Therefore, before calculating C(r), a delay embedding of the time - series is performed to yield emb_dim dimensional vectors - Y_i = [X_i, X_(i+1*lag), X_(i+2*lag), ... X_(i+(embd_dim-1)*lag)]. Choosing - a higher value for emb_dim allows to reconstruct higher dimensional dynamics - and avoids "systematic errors due to corrections to scaling". Choosing a - higher value for lag allows to avoid overestimating correlation because - X_i ~= X_i+1, but it should also not be set too high to not underestimate - correlation due to exponential divergence of trajectories in chaotic systems. - - References: - .. [cd_1] P. Grassberger and I. Procaccia, “Characterization of strange - attractors,” Physical review letters, vol. 50, no. 5, p. 346, - 1983. - .. [cd_2] P. Grassberger and I. Procaccia, “Measuring the strangeness of - strange attractors,” Physica D: Nonlinear Phenomena, vol. 9, - no. 1, pp. 189–208, 1983. - .. [cd_3] P. Grassberger, “Grassberger-Procaccia algorithm,” - Scholarpedia, vol. 2, no. 5, p. 3043. - urL: http://www.scholarpedia.org/article/Grassberger-Procaccia_algorithm - - Reference Code: - .. [cd_a] "corrDim" function in R package "fractal", - url: https://cran.r-project.org/web/packages/fractal/fractal.pdf - .. 
[cd_b] Peng Yuehua, "Correlation dimension", - url: http://de.mathworks.com/matlabcentral/fileexchange/24089-correlation-dimension - - Args: - data (array-like of float): - time series of data points - emb_dim (int): - embedding dimension - Kwargs: - rvals (iterable of float): - list of values for to use for r - (default: logarithmic_r(0.1 * std, 0.5 * std, 1.03)) - dist (function (2d-array, 1d-array) -> 1d-array): - row-wise difference function - fit (str): - the fitting method to use for the line fit, either 'poly' for normal - least squares polynomial fitting or 'RANSAC' for RANSAC-fitting which - is more robust to outliers - debug_plot (boolean): - if True, a simple plot of the final line-fitting step will be shown - debug_data (boolean): - if True, debugging data will be returned alongside the result - plot_file (str): - if debug_plot is True and plot_file is not None, the plot will be saved - under the given file name instead of directly showing it through - ``plt.show()`` - - Returns: - float: - correlation dimension as slope of the line fitted to log(r) vs log(C(r)) - (1d-vector, 1d-vector, list): - only present if debug_data is True: debug data of the form - ``(rvals, csums, poly)`` where ``rvals`` are the values used for log(r), - ``csums`` are the corresponding log(C(r)) and ``poly`` are the line - coefficients (``[slope, intercept]``) - """ - # TODO determine lag in units of time instead of number of datapoints - data = np.asarray(data) - - # TODO what are good values for r? - # TODO do this for multiple values of emb_dim? - if rvals is None: - sd = np.std(data, ddof=1) - rvals = logarithmic_r(0.1 * sd, 0.5 * sd, 1.03) - orbit = delay_embedding(data, emb_dim, lag=lag) - n = len(orbit) - dists = np.zeros((len(orbit), len(orbit)), dtype=np.float64) - for i in range(len(orbit)): - # calculate distances between X_i and X_i+1, X_i+2, ... 
, X_n-1 - # NOTE: strictly speaking, [cd_1] does not specify to exclude self-matches - # however, since both [cd_2] and [cd_3] specify to only compare i with j != i - # or j > i respectively, it is safe to assume that this was an oversight in - # [cd_1] - d = dist(orbit[i+1:], orbit[i]) - dists[i+1:,i] = d # fill column i - dists[i,i+1:] = d # fill row i - csums = [] - for r in rvals: - # NOTE: The [cd_1] and [cd_2] both use the factor 1/N^2 here. - # However, since we only use these values to fit a line in a log-log plot - # any multiplicative constant doesn't change the result since it will - # only result in an offset on the y-axis. Also, [cd_3] has a point here - # in that if we exclude self-matches in the numerator, it makes sense to - # also exclude self-matches from the denominator. - s = 1.0 / (n * (n - 1)) * np.sum(dists <= r) - csums.append(s) - csums = np.array(csums) - # filter zeros from csums - nonzero = np.where(csums != 0) - rvals = np.array(rvals)[nonzero] - csums = csums[nonzero] - if len(csums) == 0: - # all sums are zero => we cannot fit a line - poly = [np.nan, np.nan] - else: - poly = poly_fit(np.log(rvals), np.log(csums), 1, fit=fit) - if debug_plot: - plot_reg(np.log(rvals), np.log(csums), poly, "log(r)", "log(C(r))", - fname=plot_file) - if debug_data: - return (poly[0], (np.log(rvals), np.log(csums), poly)) - else: + | tuple[ + FloatArray1D, + FloatArray1D, + tuple[ + FloatArray1D, + FloatArray2D, + FloatArray2D, + ], + ] +): + """Calculates the Generalized Hurst Exponent H_q according to Di Matteo and Aste. + + This implementation is a port of the MATLAB code of Tomaso Aste. + + Explanation of the Generalized Hurst Exponent: + See mfhurst_b. + + Warning: I do not recommend using this function unless you want to reproduce + examples from Di Matteo et al. From my experiments and a critical code + analysis it seems that mfhurst_b should provide more robust results.
+ + The design choices that make mfhurst_dm different from mfhurst_b are the + following: + + - By default, a linear trend is removed from the data. This can be sensible + in some application areas (such as stock market analysis), but I think + this should be an additional preprocessing step and not part of this + algorithm. + - In the calculation of the height-height correlations, the differences + (h(x) - h(x + d)) are not calculated for every possible x from 0 to N-d-1, + but instead d is used as a step size for x. I see no justification for + this choice. It makes the algorithm run faster, but it also takes away + a lot of statistical robustness, especially for large values of d. + This effect can be clearly seen when setting `debug_plot` to `True`. + - The algorithm uses a linear scale for the distance values d = 1, 2, 3, + ..., tau_max. This is counterintuitive, since we later plot log(d) + against log(c_q(d)). A linear scale will have a bias towards larger + values in the logarithmic scale. A logarithmic scale for d seems to be + a more natural fit. If low values of d yield statistically unstable + results, they should simply be omitted. + - The algorithm tests multiple values for tau_max, which is the maximum + distance that will be calculated. In [mhd_1]_ the authors state that this + is done to test the robustness of the approach. However, taking the + mean of several runs with different tau_max will not produce any more + information than performing one run with the largest tau_max. Instead + it will only introduce a bias towards low values for d. + + References: + .. [mhd_1] T. Di Matteo, T. Aste, and M. M. Dacorogna, “Scaling behaviors + in differently developed markets,” Physica A: Statistical Mechanics + and its Applications, vol. 324, no. 1–2, pp. 183–188, 2003. + + Reference code: + ..
[mhd_a] Tomaso Aste, "Generalized Hurst exponent", + url: http://de.mathworks.com/matlabcentral/fileexchange/30076-generalized-hurst-exponent + + Args: + data: input data (should be evenly sampled) + qvals: values of q for which H_q should be calculated (default: [1]) + max_dists: different values to test for tau_max, the maximum value for the distance + d. The resulting H_q will be a mean of all H_q calculated with tau_max + = max_dists[0], max_dists[1], ... . + detrend: if True, a linear trend will be removed from the data before H_q will + be calculated + fit: the fitting method to use for the line fit, either 'poly' for normal + least squares polynomial fitting or 'RANSAC' for RANSAC-fitting which + is more robust to outliers + debug_plot: if True, a simple plot of the final line-fitting step will be shown + debug_data: if True, debugging data will be returned alongside the result + plot_file: if debug_plot is True and plot_file is not None, the plot will be saved + under the given file name instead of directly showing it through + ``plt.show()`` + random_state: Seed for random number generator used for RANSAC + + Returns: + tuple containing + + - mH: array of mH_q for every q given in ``qvals`` where mH_q is the mean of + all H_q calculated for different max distances in max_dists. + - sH: same as mH, but calculating the standard deviation instead of the mean. + + If ``debug_data`` is True, the return value is instead a tuple containing + + - mH: array of mean H_q for each q in ``qvals`` + - sH: array of standard deviation of H_q for each q in ``qvals`` + - debug_data: a tuple of three arrays + - xvals: the logarithm of the distances used for the height-height + correlation + - yvals: the logarithm of the height-height correlations for each + distance (first dimension) and each q (second dimension) with shape + (len(dists), len(qvals)) + - poly: the coefficients of the line fit (``[slope, intercept]``) + for each q in the shape (len(qvals), 2). 
+ """ + # transform to array if necessary + data = np.asarray(data, dtype=float_precision) + if qvals is None: + # actual default parameter would introduce shared list + # see: http://pylint-messages.wikidot.com/messages:w0102 + qvals = [1] + qvals = np.asarray(qvals, dtype=float_precision) + if max_dists is None: + max_dists = range(5, 20) + max_dists = np.asarray(max_dists, dtype=int_precision) + min_reliable_n = 60 + if len(data) < min_reliable_n: + warnings.warn( + f"H(q) is not reliable for small time series ({len(data)} < 60)", + stacklevel=2, + ) + max_max_dist = np.max(max_dists) + hhcorr = [] + # NOTE: I don't think it's a good idea to use a linear scale for the distance + # values. Our fit is in logarithmic space, so this will place more weight on + # the higher distance. This is not bad per se, but if you think that the + # first values are unreliable, it would be better to skip them alltogether. + for dist in range(1, max_max_dist + 1): + # NOTE: I don't think applying a step size to the input data is reasonable. + # I cannot find any justification for this in the papers and reduces the + # number of points that we can use to make our mean statistically stable. + step_size = dist + stepdata = data[::step_size] + if detrend: + stepdata = detrend_data(stepdata, order=1, random_state=random_state) + diffs = stepdata[1:] - stepdata[:-1] + hhcorr.append([np.mean(np.abs(diffs) ** q) / np.mean(np.abs(stepdata) ** q) for q in qvals]) + hhcorr = np.array(hhcorr, dtype=float_precision) + xvals = np.log(np.arange(1, max_max_dist + 1)) + yvals = np.log(hhcorr) + # NOTE: Using several maximum distances seems to be a strange way to + # introduce stability, since it only places emphasis on the lower distance + # ranges and does not introduce any new information. 
+ H = np.array( + [ + poly_fit(xvals[:md], yvals[:md, qi], 1, fit=fit, random_state=random_state)[0] + for qi in range(len(qvals)) + for md in max_dists + ], + dtype=float_precision, + ).reshape(len(qvals), len(max_dists)) + if debug_plot: + polys = np.array( + [poly_fit(xvals, yvals[:, qi], 1) / qvals[qi] for qi in range(len(qvals))], + dtype=float_precision, + ) + plot_reg_multiple( + np.array([xvals] * len(qvals), dtype=float_precision), + np.array([yvals[:, qi] / qvals[qi] for qi in range(len(qvals))], dtype=float_precision), + polys, + x_label="log(x)", + y_label="$\\log(c_q(x)) / q$", + data_labels=[f"q = {q}" for q in qvals], + reg_labels=[f"reg. line (H = {h:.3f})" for h in H[:, -1] / qvals], + fname=plot_file, + ) + mH = np.mean(H, axis=1) / qvals + sH = np.std(H, axis=1) / qvals + if debug_data: + return (mH, sH, (xvals, yvals, polys)) + return (mH, sH) + + +@overload +def corr_dim( + data: NumberArrayLike1D, + emb_dim: int = 2, + lag: int = 1, + rvals: FloatArrayLike1D | None = None, + dist: Callable[ + [ + FloatArray2D, + FloatArray1D, + ], + FloatArray1D, + ] = rowwise_euclidean, + fit: FittingMethod = "RANSAC", + *, + debug_plot: bool = False, + debug_data: Literal[False] = False, + plot_file: str | Path | None = None, + random_state: int | None = None, +) -> float: ... + + +@overload +def corr_dim( + data: NumberArrayLike1D, + emb_dim: int = 2, + lag: int = 1, + rvals: FloatArrayLike1D | None = None, + dist: Callable[ + [ + FloatArray2D, + FloatArray1D, + ], + FloatArray1D, + ] = rowwise_euclidean, + fit: FittingMethod = "RANSAC", + *, + debug_plot: bool = False, + debug_data: Literal[True] = True, + plot_file: str | Path | None = None, + random_state: int | None = None, +) -> tuple[ + float, + tuple[ + FloatArray1D, + FloatArray1D, + FloatArray1D, + ], +]: ... 
+ + +def corr_dim( + data: NumberArrayLike1D, + emb_dim: int = 2, + lag: int = 1, + rvals: FloatArrayLike1D | None = None, + dist: Callable[ + [ + FloatArray2D, + FloatArray1D, + ], + FloatArray1D, + ] = rowwise_euclidean, + fit: FittingMethod = "RANSAC", + *, + debug_plot: bool = False, + debug_data: bool = False, + plot_file: str | Path | None = None, + random_state: int | None = None, +) -> ( + float + | tuple[ + float, + tuple[ + FloatArray1D, + FloatArray1D, + FloatArray1D, + ], + ] +): + """Calculates the correlation dimension with the Grassberger-Procaccia algorithm. + + Explanation of correlation dimension: + The correlation dimension is a characteristic measure that can be used + to describe the geometry of chaotic attractors. It is defined using the + correlation sum C(r) which is the fraction of pairs of points X_i in the + phase space whose distance is smaller than r. + + If the relation between C(r) and r can be described by the power law + + C(r) ~ r^D + + then D is called the correlation dimension of the system. + + In a d-dimensional system, the maximum value for D is d. This value is + obtained for systems that expand uniformly in each dimension with time. + The lowest possible value is 0 for a system with constant C(r) (i.e. a + system that visits just one point in the phase space). Generally if D is + lower than d and the system has an attractor, this attractor is called + "strange" and D is a measure of this "strangeness". + + Explanation of the algorithm: + The Grassberger-Procaccia algorithm calculates C(r) for a range of + different r and then fits a straight line into the plot of log(C(r)) + versus log(r). + + This version of the algorithm is created for one-dimensional (scalar) time + series. Therefore, before calculating C(r), a delay embedding of the time + series is performed to yield emb_dim dimensional vectors + Y_i = [X_i, X_(i+1*lag), X_(i+2*lag), ... X_(i+(embd_dim-1)*lag)]. 
Choosing + a higher value for emb_dim allows reconstructing higher dimensional dynamics + and avoids "systematic errors due to corrections to scaling". Choosing a + higher value for lag avoids overestimating correlation because + X_i ~= X_i+1, but it should also not be set too high, so as not to underestimate + correlation due to exponential divergence of trajectories in chaotic systems. + + References: + .. [cd_1] P. Grassberger and I. Procaccia, “Characterization of strange + attractors,” Physical review letters, vol. 50, no. 5, p. 346, + 1983. + .. [cd_2] P. Grassberger and I. Procaccia, “Measuring the strangeness of + strange attractors,” Physica D: Nonlinear Phenomena, vol. 9, + no. 1, pp. 189–208, 1983. + .. [cd_3] P. Grassberger, “Grassberger-Procaccia algorithm,” + Scholarpedia, vol. 2, no. 5, p. 3043. + url: http://www.scholarpedia.org/article/Grassberger-Procaccia_algorithm + + Reference Code: + .. [cd_a] "corrDim" function in R package "fractal", + url: https://cran.r-project.org/web/packages/fractal/fractal.pdf + ..
[cd_b] Peng Yuehua, "Correlation dimension", + url: http://de.mathworks.com/matlabcentral/fileexchange/24089-correlation-dimension + + Args: + data: time series of data points + emb_dim: embedding dimension + lag: the distance between two successive elements in the embedding vectors + (given in number of datapoints) + rvals: list of values to use for r + (default: logarithmic_r(0.1 * std, 0.5 * std, 1.03)) + dist: row-wise difference function + fit: the fitting method to use for the line fit, either 'poly' for normal + least squares polynomial fitting or 'RANSAC' for RANSAC-fitting which + is more robust to outliers + debug_plot: if True, a simple plot of the final line-fitting step will be shown + debug_data: if True, debugging data will be returned alongside the result + plot_file: if debug_plot is True and plot_file is not None, the plot will be saved + under the given file name instead of directly showing it through + ``plt.show()`` + random_state: Seed for random number generator used for RANSAC + + Returns: + correlation dimension as slope of the line fitted to log(r) vs log(C(r)) + + If ``debug_data`` is True, the return value is instead a tuple containing + + - cd: the correlation dimension + - debug_data: tuple containing + - rvals: the values used for log(r) + - csums: the corresponding log(C(r)) + - poly: the line coefficients (``[slope, intercept]``) + """ + # TODO: determine lag in units of time instead of number of datapoints + data = np.asarray(data, dtype=float_precision) + + # TODO: what are good values for r? + # TODO: do this for multiple values of emb_dim? + if rvals is None: + sd = float(np.std(data, ddof=1)) + rvals = logarithmic_r(0.1 * sd, 0.5 * sd, 1.03) + rvals = np.asarray(rvals, dtype=float_precision) + orbit = delay_embedding(data, emb_dim, lag=lag) + n = len(orbit) + dists = np.zeros((len(orbit), len(orbit)), dtype=float_precision) + for i in range(len(orbit)): + # calculate distances between X_i and X_i+1, X_i+2, ...
, X_n-1 + # NOTE: strictly speaking, [cd_1] does not specify to exclude self-matches + # however, since both [cd_2] and [cd_3] specify to only compare i with j != i + # or j > i respectively, it is safe to assume that this was an oversight in + # [cd_1] # noqa: ERA001 + d = dist(orbit[i + 1 :], orbit[i]) + dists[i + 1 :, i] = d # fill column i + dists[i, i + 1 :] = d # fill row i + csums = [] + for r in rvals: + # NOTE: The [cd_1] and [cd_2] both use the factor 1/N^2 here. + # However, since we only use these values to fit a line in a log-log plot + # any multiplicative constant doesn't change the result since it will + # only result in an offset on the y-axis. Also, [cd_3] has a point here + # in that if we exclude self-matches in the numerator, it makes sense to + # also exclude self-matches from the denominator. + s = 1.0 / (n * (n - 1)) * np.sum(dists <= r) + csums.append(s) + csums = np.array(csums, dtype=float_precision) + # filter zeros from csums + nonzero = np.where(csums != 0) + rvals = rvals[nonzero] + csums = csums[nonzero] + if len(csums) == 0: + # all sums are zero => we cannot fit a line + poly = np.array([np.nan, np.nan], dtype=float_precision) + else: + poly = poly_fit(np.log(rvals), np.log(csums), 1, fit=fit, random_state=random_state) + if debug_plot: + plot_reg(np.log(rvals), np.log(csums), poly, "log(r)", "log(C(r))", fname=plot_file) + if debug_data: + return (poly[0], (np.log(rvals), np.log(csums), poly)) return poly[0] -def detrend_data(data, order=1, fit="poly"): - """ - Removes a trend of given order from the data. 
- """ - # TODO also use this function in dfa - xvals = np.arange(len(data)) - trend = poly_fit(xvals, data, order, fit=fit) - detrended = data - np.polyval(trend, xvals) - return detrended - - -def dfa(data, nvals=None, overlap=True, order=1, fit_trend="poly", - fit_exp="RANSAC", debug_plot=False, debug_data=False, plot_file=None): - """ - Performs a detrended fluctuation analysis (DFA) on the given data - - Recommendations for parameter settings by Hardstone et al.: - * nvals should be equally spaced on a logarithmic scale so that each window - scale hase the same weight - * min(nvals) < 4 does not make much sense as fitting a polynomial (even if - it is only of order 1) to 3 or less data points is very prone to errors. - * max(nvals) > len(data) / 10 does not make much sense as we will then have - less than 10 windows to calculate the average fluctuation - * use overlap=True to obtain more windows and therefore better statistics - (at an increased computational cost) - - Explanation of DFA: - Detrended fluctuation analysis, much like the Hurst exponent, is used to - find long-term statistical dependencies in time series. However, while the - Hurst exponent will indicate long-term correlations for any non-stationary - process (i.e. a stochastic process whose probability distribution changes - when shifted in time, such as a random walk whose mean changes over time), - DFA was designed to distinguish between correlations that are purely an - artifact of non-stationarity and those that show inherent long-term - behavior of the studied system. - - Mathematically, the long-term correlations that we are interested in can - be characterized using the autocorrelation function C(s). For a time series - (x_i) with i = 1, ..., N it is defined as follows: - - C(s) = 1/(N-s) * (y_1 * y_1+s + y_2 * y_2+s + ... y_(N-s) * y_N) - - with y_i = x_i - mean(x). If there are no correlations at all, C(s) would - be zero for s > 0. 
For short-range correlations, C(s) will decline - exponentially, but for long-term correlations the decline follows a power - law of the form C(s) ~ s^(-gamma) instead with 0 < gamma < 1. - - Due to noise and underlying trends, calculating C(s) directly is usually not - feasible. The main idea of DFA is therefore to remove trends up to a given - order from the input data and analyze the remaining fluctuations. Trends - in this sense are smooth signals with monotonous or slowly oscillating - behavior that are caused by external effects and not the dynamical system - under study. - - To get a hold of these trends, the first step is to calculate the "profile" - of our time series as the cumulative sum of deviations from the mean, - effectively integrating our data. This both smoothes out measurement noise - and makes it easier to distinguish the fractal properties of bounded time - series (i.e. time series whose values cannot grow or shrink beyond certain - bounds such as most biological or physical signals) by applying random walk - theory (see [dfa_3]_ and [dfa_4]_). - - y_i = x_1 - mean(x) + x_2 - mean(x) + ... + x_i - mean(x). - - After that, we split Y(i) into (usually non-overlapping) windows of length - n to calculate local trends at this given scale. The ith window of this - size has the form - - W_(n,i) = [y_i, y_(i+1), y_(i+2), ... y_(i+n-1)] - - The local trends are then removed for each window separately by fitting a - polynomial p_(n,i) to the window W_(n,i) and then calculating - W'_(n,i) = W_(n,i) - p_(n,i) (element-wise subtraction). - - This leaves us with the deviations from the trend - the "fluctuations" - - that we are interested in. To quantify them, we take the root mean square - of these fluctuations. It is important to note that we have to sum up all - individual fluctuations across all windows and divide by the total number - of fluctuations here before finally taking the root as last step. 
Some - implementations apply another root per window, which skews the result. - - The resulting fluctuation F(n) is then only dependent on the window size n, - the scale at which we observe our data. It behaves similar to the - autocorrelation function in that it follows a power-law for long-term - correlations: - - F(n) ~ n^alpha - - Where alpha is the Hurst parameter, which we can obtain from fitting a line - into the plot of log(n) versus log(F(n)) and taking the slope. - - The result can be interpreted as follows: For alpha < 1 the underlying - process is stationary and can be modelled as fractional Gaussian noise with - H = alpha. This means for alpha = 0.5 we have no long-term correlation or - "memory", for 0.5 < alpha < 1 we have positive long-term correlations and - for alpha < 0.5 the long-term correlations are negative. - - For alpha > 1 the underlying process is non-stationary and can be modeled - as fractional Brownian motion with H = alpha - 1. - - References: - .. [dfa_1] C.-K. Peng, S. V. Buldyrev, S. Havlin, M. Simons, - H. E. Stanley, and A. L. Goldberger, “Mosaic organization of - DNA nucleotides,” Physical Review E, vol. 49, no. 2, 1994. - .. [dfa_2] J. W. Kantelhardt, E. Koscielny-Bunde, H. H. A. Rego, S. - Havlin, and A. Bunde, “Detecting long-range correlations with - detrended fluctuation analysis,” Physica A: Statistical - Mechanics and its Applications, vol. 295, no. 3–4, pp. 441–454, - Jun. 2001, doi: 10.1016/S0378-4371(01)00144-3. - .. [dfa_3] C. Peng, J. M. Hausdorff, and A. L. Goldberger, “Fractal - mechanisms in neuronal control: human heartbeat and gait - dynamics in health and disease,” in Self-Organized Biological - Dynamics and Nonlinear Control, 1st ed., J. Walleczek, Ed., - Cambridge University Press, 2000, pp. 66–96. - doi: 10.1017/CBO9780511535338.006. - .. [dfa_4] A. Bashan, R. Bartsch, J. W. Kantelhardt, and S. 
Havlin, - “Comparison of detrending methods for fluctuation analysis,” - Physica A: Statistical Mechanics and its Applications, vol. 387, - no. 21, pp. 5080–5090, Sep. 2008, - doi: 10.1016/j.physa.2008.04.023. - .. [dfa_5] R. Hardstone, S.-S. Poil, G. Schiavone, R. Jansen, - V. V. Nikulin, H. D. Mansvelder, and K. Linkenkaer-Hansen, - “Detrended fluctuation analysis: A scale-free view on neuronal - oscillations,” Frontiers in Physiology, vol. 30, 2012. - - Reference code: - .. [dfa_a] Peter Jurica, "Introduction to MDFA in Python", - url: http://bsp.brain.riken.jp/~juricap/mdfa/mdfaintro.html - .. [dfa_b] JE Mietus, "dfa", - url: https://www.physionet.org/physiotools/dfa/dfa-1.htm - .. [dfa_c] "DFA" function in R package "fractal" - - Args: - data (array-like of float): - time series - Kwargs: - nvals (iterable of int): - subseries sizes at which to calculate fluctuation - (default: logarithmic_n(4, 0.1*len(data), 1.2)) - overlap (boolean): - if True, the windows W_(n,i) will have a 50% overlap, - otherwise non-overlapping windows will be used - order (int): - (polynomial) order of trend to remove - fit_trend (str): - the fitting method to use for fitting the trends, either 'poly' - for normal least squares polynomial fitting or 'RANSAC' for - RANSAC-fitting which is more robust to outliers but also tends to - lead to unstable results - fit_exp (str): - the fitting method to use for the line fit, either 'poly' for normal - least squares polynomial fitting or 'RANSAC' for RANSAC-fitting which - is more robust to outliers - debug_plot (boolean): - if True, a simple plot of the final line-fitting step will be shown - debug_data (boolean): - if True, debugging data will be returned alongside the result - plot_file (str): - if debug_plot is True and plot_file is not None, the plot will be saved - under the given file name instead of directly showing it through - ``plt.show()`` - Returns: - float: - the estimate alpha for the Hurst parameter (alpha < 1: stationary - 
process similar to fractional Gaussian noise with H = alpha, - alpha > 1: non-stationary process similar to fractional Brownian - motion with H = alpha - 1) - (1d-vector, 1d-vector, list): - only present if debug_data is True: debug data of the form - ``(nvals, fluctuations, poly)`` where ``nvals`` are the values used for - log(n), ``fluctuations`` are the corresponding log(std(X,n)) and ``poly`` - are the line coefficients (``[slope, intercept]``) - """ - data = np.asarray(data) - total_N = len(data) - if nvals is None: - if total_N > 70: - nvals = logarithmic_n(4, 0.1 * total_N, 1.2) - elif total_N > 10: - nvals = [4, 5, 6, 7, 8, 9] - else: - nvals = [total_N-2, total_N-1] - msg = "choosing nvals = {} , DFA with less than ten data points is " \ - + "extremely unreliable" - warnings.warn(msg.format(nvals), RuntimeWarning) - if len(nvals) < 2: - raise ValueError("at least two nvals are needed") - if np.min(nvals) < 2: - raise ValueError("nvals must be at least two") - if np.max(nvals) >= total_N: - raise ValueError("nvals cannot be larger than the input size") - # create the signal profile - # (cumulative sum of deviations from the mean => "walk") - walk = np.cumsum(data - np.mean(data)) - fluctuations = [] - for n in nvals: - assert n >= 2 - # subdivide data into chunks of size n - if overlap: - # step size n/2 instead of n - d = np.array([walk[i:i + n] for i in range(0, len(walk) - n, n // 2)]) +def detrend_data( + data: FloatArray1D, + order: int = 1, + fit: FittingMethod = "poly", + random_state: int | None = None, +) -> FloatArray1D: + """Removes a trend of given order from the data.""" + # TODO: also use this function in dfa + xvals = np.arange(len(data)) + trend = poly_fit(xvals, data, order, fit=fit, random_state=random_state) + return data - np.polyval(trend, xvals) + + +@overload +def dfa( + data: NumberArrayLike1D, + nvals: IntArrayLike1D | None = None, + *, + overlap: bool = True, + order: int = 1, + fit_trend: FittingMethod = "poly", + fit_exp: 
FittingMethod = "RANSAC", + debug_plot: bool = False, + debug_data: Literal[False] = False, + plot_file: str | Path | None = None, + random_state: int | None = None, +) -> float: ... + + +@overload +def dfa( + data: NumberArrayLike1D, + nvals: IntArrayLike1D | None = None, + *, + overlap: bool = True, + order: int = 1, + fit_trend: FittingMethod = "poly", + fit_exp: FittingMethod = "RANSAC", + debug_plot: bool = False, + debug_data: Literal[True] = True, + plot_file: str | Path | None = None, + random_state: int | None = None, +) -> tuple[ + float, + tuple[ + FloatArray1D, + FloatArray1D, + FloatArray1D, + ], +]: ... + + +def dfa( # noqa: C901, PLR0912, PLR0915 + data: NumberArrayLike1D, + nvals: IntArrayLike1D | None = None, + *, + overlap: bool = True, + order: int = 1, + fit_trend: FittingMethod = "poly", + fit_exp: FittingMethod = "RANSAC", + debug_plot: bool = False, + debug_data: bool = False, + plot_file: str | Path | None = None, + random_state: int | None = None, +) -> ( + float + | tuple[ + float, + tuple[ + FloatArray1D, + FloatArray1D, + FloatArray1D, + ], + ] +): + """Performs a detrended fluctuation analysis (DFA) on the given data. + + Recommendations for parameter settings by Hardstone et al.: + * nvals should be equally spaced on a logarithmic scale so that each window + scale hase the same weight + * min(nvals) < 4 does not make much sense as fitting a polynomial (even if + it is only of order 1) to 3 or less data points is very prone to errors. + * max(nvals) > len(data) / 10 does not make much sense as we will then have + less than 10 windows to calculate the average fluctuation + * use overlap=True to obtain more windows and therefore better statistics + (at an increased computational cost) + + Explanation of DFA: + Detrended fluctuation analysis, much like the Hurst exponent, is used to + find long-term statistical dependencies in time series. 
However, while the + Hurst exponent will indicate long-term correlations for any non-stationary + process (i.e. a stochastic process whose probability distribution changes + when shifted in time, such as a random walk whose mean changes over time), + DFA was designed to distinguish between correlations that are purely an + artifact of non-stationarity and those that show inherent long-term + behavior of the studied system. + + Mathematically, the long-term correlations that we are interested in can + be characterized using the autocorrelation function C(s). For a time series + (x_i) with i = 1, ..., N it is defined as follows: + + C(s) = 1/(N-s) * (y_1 * y_1+s + y_2 * y_2+s + ... y_(N-s) * y_N) + + with y_i = x_i - mean(x). If there are no correlations at all, C(s) would + be zero for s > 0. For short-range correlations, C(s) will decline + exponentially, but for long-term correlations the decline follows a power + law of the form C(s) ~ s^(-gamma) instead with 0 < gamma < 1. + + Due to noise and underlying trends, calculating C(s) directly is usually not + feasible. The main idea of DFA is therefore to remove trends up to a given + order from the input data and analyze the remaining fluctuations. Trends + in this sense are smooth signals with monotonous or slowly oscillating + behavior that are caused by external effects and not the dynamical system + under study. + + To get a hold of these trends, the first step is to calculate the "profile" + of our time series as the cumulative sum of deviations from the mean, + effectively integrating our data. This both smoothes out measurement noise + and makes it easier to distinguish the fractal properties of bounded time + series (i.e. time series whose values cannot grow or shrink beyond certain + bounds such as most biological or physical signals) by applying random walk + theory (see [dfa_3]_ and [dfa_4]_). + + y_i = x_1 - mean(x) + x_2 - mean(x) + ... + x_i - mean(x). 
+ + After that, we split Y(i) into (usually non-overlapping) windows of length + n to calculate local trends at this given scale. The ith window of this + size has the form + + W_(n,i) = [y_i, y_(i+1), y_(i+2), ... y_(i+n-1)] + + The local trends are then removed for each window separately by fitting a + polynomial p_(n,i) to the window W_(n,i) and then calculating + W'_(n,i) = W_(n,i) - p_(n,i) (element-wise subtraction). + + This leaves us with the deviations from the trend - the "fluctuations" - + that we are interested in. To quantify them, we take the root mean square + of these fluctuations. It is important to note that we have to sum up all + individual fluctuations across all windows and divide by the total number + of fluctuations here before finally taking the root as last step. Some + implementations apply another root per window, which skews the result. + + The resulting fluctuation F(n) is then only dependent on the window size n, + the scale at which we observe our data. It behaves similar to the + autocorrelation function in that it follows a power-law for long-term + correlations: + + F(n) ~ n^alpha + + Where alpha is the Hurst parameter, which we can obtain from fitting a line + into the plot of log(n) versus log(F(n)) and taking the slope. + + The result can be interpreted as follows: For alpha < 1 the underlying + process is stationary and can be modelled as fractional Gaussian noise with + H = alpha. This means for alpha = 0.5 we have no long-term correlation or + "memory", for 0.5 < alpha < 1 we have positive long-term correlations and + for alpha < 0.5 the long-term correlations are negative. + + For alpha > 1 the underlying process is non-stationary and can be modeled + as fractional Brownian motion with H = alpha - 1. + + References: + .. [dfa_1] C.-K. Peng, S. V. Buldyrev, S. Havlin, M. Simons, + H. E. Stanley, and A. L. Goldberger, “Mosaic organization of + DNA nucleotides,” Physical Review E, vol. 49, no. 2, 1994. + .. [dfa_2] J. W. 
Kantelhardt, E. Koscielny-Bunde, H. H. A. Rego, S. + Havlin, and A. Bunde, “Detecting long-range correlations with + detrended fluctuation analysis,” Physica A: Statistical + Mechanics and its Applications, vol. 295, no. 3–4, pp. 441–454, + Jun. 2001, doi: 10.1016/S0378-4371(01)00144-3. + .. [dfa_3] C. Peng, J. M. Hausdorff, and A. L. Goldberger, “Fractal + mechanisms in neuronal control: human heartbeat and gait + dynamics in health and disease,” in Self-Organized Biological + Dynamics and Nonlinear Control, 1st ed., J. Walleczek, Ed., + Cambridge University Press, 2000, pp. 66–96. + doi: 10.1017/CBO9780511535338.006. + .. [dfa_4] A. Bashan, R. Bartsch, J. W. Kantelhardt, and S. Havlin, + “Comparison of detrending methods for fluctuation analysis,” + Physica A: Statistical Mechanics and its Applications, vol. 387, + no. 21, pp. 5080–5090, Sep. 2008, + doi: 10.1016/j.physa.2008.04.023. + .. [dfa_5] R. Hardstone, S.-S. Poil, G. Schiavone, R. Jansen, + V. V. Nikulin, H. D. Mansvelder, and K. Linkenkaer-Hansen, + “Detrended fluctuation analysis: A scale-free view on neuronal + oscillations,” Frontiers in Physiology, vol. 30, 2012. + + Reference code: + .. [dfa_a] Peter Jurica, "Introduction to MDFA in Python", + url: http://bsp.brain.riken.jp/~juricap/mdfa/mdfaintro.html + .. [dfa_b] JE Mietus, "dfa", + url: https://www.physionet.org/physiotools/dfa/dfa-1.htm + .. 
[dfa_c] "DFA" function in R package "fractal" + + Args: + data: time series + nvals: subseries sizes at which to calculate fluctuation + (default: logarithmic_n(4, 0.1*len(data), 1.2)) + overlap: if True, the windows W_(n,i) will have a 50% overlap, + otherwise non-overlapping windows will be used + order: (polynomial) order of trend to remove + fit_trend: the fitting method to use for fitting the trends, either 'poly' + for normal least squares polynomial fitting or 'RANSAC' for + RANSAC-fitting which is more robust to outliers but also tends to + lead to unstable results + fit_exp: the fitting method to use for the line fit, either 'poly' for normal + least squares polynomial fitting or 'RANSAC' for RANSAC-fitting which + is more robust to outliers + debug_plot: if True, a simple plot of the final line-fitting step will be shown + debug_data: if True, debugging data will be returned alongside the result + plot_file: if debug_plot is True and plot_file is not None, the plot will be saved + under the given file name instead of directly showing it through + ``plt.show()`` + random_state: Seed for random number generator used for RANSAC + Returns: + the estimate alpha for the Hurst parameter (alpha < 1: stationary + process similar to fractional Gaussian noise with H = alpha, + alpha > 1: non-stationary process similar to fractional Brownian + motion with H = alpha - 1) + + If ``debug_data`` is set to ``True``, the return value is instead a tuple containing + + - H: Hurst parameter + - nvals: the values used for log(n) + - fluctuations: the corresponding log(std(X,n)) + - poly: the line coefficients (``[slope, intercept]``) + """ + data = np.asarray(data, dtype=float_precision) + total_N = len(data) + if nvals is None: + min_n_for_log_scale = 70 + min_n = 10 + if total_N > min_n_for_log_scale: + nvals = logarithmic_n(4, np.floor(0.1 * total_N), 1.2) + elif total_N > min_n: + nvals = [4, 5, 6, 7, 8, 9] + else: + nvals = [total_N - 2, total_N - 1] + msg = "choosing 
nvals = {} , DFA with less than ten data points is extremely unreliable" + warnings.warn(msg.format(nvals), RuntimeWarning, stacklevel=2) + nvals = np.asarray(nvals, dtype=int_precision) + min_number_of_nvals = 2 + min_nval = 2 + if nvals.shape[0] < min_number_of_nvals: + msg = "at least two nvals are needed" + raise ValueError(msg) + if np.min(nvals) < min_nval: + msg = "nvals must be at least two" + raise ValueError(msg) + if np.max(nvals) >= total_N: + msg = "nvals cannot be larger than the input size" + raise ValueError(msg) + # create the signal profile + # (cumulative sum of deviations from the mean => "walk") + walk = np.cumsum(data - np.mean(data)) + fluctuations = [] + for n in nvals: + assert n >= min_nval + # subdivide data into chunks of size n + if overlap: + # step size n/2 instead of n + d = np.array( + [walk[i : i + n] for i in range(0, len(walk) - n, n // 2)], dtype=float_precision + ) + else: + # non-overlapping windows => we can simply do a reshape + d = walk[: total_N - (total_N % n)] + d = d.reshape((total_N // n, n)) + # calculate local trends as polynomes + x = np.arange(n) + tpoly = [ + poly_fit(x, d[i], order, fit=fit_trend, random_state=random_state) + for i in range(len(d)) + ] + tpoly = np.array(tpoly, dtype=float_precision) + trend = np.array([np.polyval(tpoly[i], x) for i in range(len(d))], dtype=float_precision) + # calculate mean-square differences for each walk in d around trend + flucs = np.sum((d - trend) ** 2, axis=1) / n + # take another mean across all walks and finally take the square root of that + # NOTE: To map this to the formula in Peng1995, observe that this simplifies + # to np.sqrt(np.sum((d - trend) ** 2) / total_N) if we have non-overlapping + # windows and the last window matches the end of the data perfectly. 
+ f_n = np.sqrt(np.sum(flucs) / len(flucs)) + fluctuations.append(f_n) + fluctuations = np.array(fluctuations, dtype=float_precision) + # filter zeros from fluctuations + nonzero = np.where(fluctuations != 0) + nvals = nvals[nonzero] + fluctuations = fluctuations[nonzero] + if len(fluctuations) == 0: + # all fluctuations are zero => we cannot fit a line + poly = np.array([np.nan, np.nan], dtype=float_precision) else: - # non-overlapping windows => we can simply do a reshape - d = walk[:total_N - (total_N % n)] - d = d.reshape((total_N // n, n)) - # calculate local trends as polynomes - x = np.arange(n) - tpoly = [poly_fit(x, d[i], order, fit=fit_trend) - for i in range(len(d))] - tpoly = np.array(tpoly) - trend = np.array([np.polyval(tpoly[i], x) for i in range(len(d))]) - # calculate mean-square differences for each walk in d around trend - flucs = np.sum((d - trend) ** 2, axis=1) / n - # take another mean across all walks and finally take the square root of that - # NOTE: To map this to the formula in Peng1995, observe that this simplifies - # to np.sqrt(np.sum((d - trend) ** 2) / total_N) if we have non-overlapping - # windows and the last window matches the end of the data perfectly. 
- f_n = np.sqrt(np.sum(flucs) / len(flucs)) - fluctuations.append(f_n) - fluctuations = np.array(fluctuations) - # filter zeros from fluctuations - nonzero = np.where(fluctuations != 0) - nvals = np.array(nvals)[nonzero] - fluctuations = fluctuations[nonzero] - if len(fluctuations) == 0: - # all fluctuations are zero => we cannot fit a line - poly = [np.nan, np.nan] - else: - poly = poly_fit(np.log(nvals), np.log(fluctuations), 1, - fit=fit_exp) - if debug_plot: - plot_reg(np.log(nvals), np.log(fluctuations), poly, "log(n)", "std(X,n)", - fname=plot_file) - if debug_data: - return (poly[0], (np.log(nvals), np.log(fluctuations), poly)) - else: + poly = poly_fit( + np.log(nvals), + np.log(fluctuations), + 1, + fit=fit_exp, + random_state=random_state, + ) + if debug_plot: + plot_reg( + np.log(nvals), + np.log(fluctuations), + poly, + "log(n)", + "std(X,n)", + fname=plot_file, + ) + if debug_data: + return (poly[0], (np.log(nvals), np.log(fluctuations), poly)) return poly[0] diff --git a/nolds/test_measures.py b/nolds/test_measures.py index b56c3e6..b2b30b0 100644 --- a/nolds/test_measures.py +++ b/nolds/test_measures.py @@ -1,691 +1,990 @@ -# -*- coding: utf-8 -*- -from __future__ import (absolute_import, division, - print_function, unicode_literals) -from builtins import ( - bytes, dict, int, list, object, range, str, ascii, chr, hex, input, next, - oct, open, pow, round, super, filter, map, zip -) -import numpy as np +"""Unit tests for main measures of measures.""" + +from __future__ import annotations -# import internal module to test helping functions -from nolds import measures as nolds -from nolds import datasets import unittest import warnings +from typing import TYPE_CHECKING, Any, Protocol, cast + +import numpy as np +from numpy.testing import assert_almost_equal + +if TYPE_CHECKING: + from numpy.typing import ArrayLike, DTypeLike -# TODO add tests for mfhurst_b and mfhurst_dm +from nolds import datasets, measures -# TODO add more tests using fgn and fbm for 
hurst_rs and dfa +# TODO: add tests for mfhurst_b and mfhurst_dm -# TODO split up tests into smaller units => one hypothesis = one test +# TODO: add more tests using fgn and fbm for hurst_rs and dfa try: - from scipy.stats import levy_stable - SCIPY_AVAILABLE = True + from scipy.stats import levy_stable + + SCIPY_AVAILABLE = True except ImportError: - SCIPY_AVAILABLE = False + SCIPY_AVAILABLE = False class TestNoldsHelperFunctions(unittest.TestCase): - """ - Tests for internal helper functions that are not part of the public API - """ - def assert_array_equals(self, expected, actual, print_arrays=False): - if print_arrays: - print(actual) - print("==") - print(expected) - print() - self.assertTrue(np.all(actual == expected)) - - def test_delay_embed_lag2(self): - data = np.arange(10, dtype="float32") - embedded = nolds.delay_embedding(data, 4, lag=2) - expected = np.array([ - [0, 2, 4, 6], - [1, 3, 5, 7], - [2, 4, 6, 8], - [3, 5, 7, 9] - ], dtype="float32") - self.assert_array_equals(expected, embedded) - - def test_delay_embed(self): - data = np.arange(6, dtype="float32") - embedded = nolds.delay_embedding(data, 4) - expected = np.array([ - [0, 1, 2, 3], - [1, 2, 3, 4], - [2, 3, 4, 5] - ], dtype="float32") - self.assert_array_equals(expected, embedded) - - def test_delay_embed_lag3(self): - data = np.arange(10, dtype="float32") - embedded = nolds.delay_embedding(data, 4, lag=3) - expected = np.array([ - [0, 3, 6, 9] - ], dtype="float32") - self.assert_array_equals(expected, embedded) - - def test_delay_embed_empty(self): - data = np.arange(10, dtype="float32") - try: - embedded = nolds.delay_embedding(data, 11) - msg = "embedding array of size 10 with embedding dimension 11 " \ - + "should fail, got {} instead" - self.fail(msg.format(embedded)) - except ValueError: - pass - data = np.arange(10, dtype="float32") - try: - embedded = nolds.delay_embedding(data, 4, lag=4) - msg = "embedding array of size 10 with embedding dimension 4 and " \ - + "lag 4 should fail, 
got {} instead" - self.fail(msg.format(embedded)) - except ValueError: - pass + """Tests for internal helper functions that are not part of the public API.""" + + def assert_array_equal( + self, + expected: ArrayLike, + actual: ArrayLike, + dtype: DTypeLike = np.float64, + ) -> None: + """Test that two arrays are exactly equal. + + Args: + expected: The expected result + actual: The actual result + dtype: dtype of the arrays to compare + """ + expected = np.asarray(expected, dtype=dtype) + actual = np.asarray(actual, dtype=dtype) + diff_indices = np.array((actual != expected).nonzero()).transpose() + first_diff = diff_indices[0] if diff_indices.shape[0] > 0 else None + first_diff_msg = f"{expected[first_diff]} != {actual[first_diff]} at {first_diff}" + msg = ( + f"Arrays differ!\n\nExpected:\n{expected}\n\nActual:\n{actual}" + f"\n\nFirst difference:\n{first_diff_msg}" + ) + assert np.all(actual == expected), msg + + def test_delay_embed_lag2(self) -> None: + """Hypothesis: Setting a lag of 2 skips every second element in orbit vectors.""" + data = np.arange(10, dtype=np.float64) + embedded = measures.delay_embedding(data, 4, lag=2) + expected = np.array( + [ + [0, 2, 4, 6], + [1, 3, 5, 7], + [2, 4, 6, 8], + [3, 5, 7, 9], + ], + dtype=np.float64, + ) + self.assert_array_equal(expected, embedded) + + def test_delay_embed(self) -> None: + """Hypothesis: Default settings produce consecutive slices of same length.""" + data = np.arange(6, dtype=np.float64) + embedded = measures.delay_embedding(data, 4) + expected = np.array( + [ + [0, 1, 2, 3], + [1, 2, 3, 4], + [2, 3, 4, 5], + ], + dtype=np.float64, + ) + self.assert_array_equal(expected, embedded) + + def test_delay_embed_lag3(self) -> None: + """Hypothesis: Setting a lag of 3 only takes every third element.""" + data = np.arange(10, dtype=np.float64) + embedded = measures.delay_embedding(data, 4, lag=3) + expected = np.array( + [ + [0, 3, 6, 9], + ], + dtype=np.float64, + ) + self.assert_array_equal(expected, 
embedded) + + def test_delay_embed_empty(self) -> None: + """Hypothesis: An error is raised when settings would lead to an empty orbit vector list.""" + data = np.arange(10, dtype=np.float64) + try: + embedded = measures.delay_embedding(data, 11) + msg = ( + "embedding array of size 10 with embedding dimension 11 should fail, got {} instead" + ) + self.fail(msg.format(embedded)) + except ValueError: + pass + data = np.arange(10, dtype=np.float64) + try: + embedded = measures.delay_embedding(data, 4, lag=4) + msg = ( + "embedding array of size 10 with embedding dimension 4 and " + "lag 4 should fail, got {} instead" + ) + self.fail(msg.format(embedded)) + except ValueError: + pass class TestNoldsUtility(unittest.TestCase): - """ - Tests for small utility functions that are part of the public API - """ - def test_binary_n(self): - x = nolds.binary_n(1000, min_n=50) - self.assertSequenceEqual(x, [500, 250, 125, 62]) + """Tests for small utility functions that are part of the public API.""" - def test_binary_n_empty(self): - x = nolds.binary_n(50, min_n=50) - self.assertSequenceEqual(x, []) + def test_binary_n(self) -> None: + """Hypothesis: binary_n produces exponentially declining numbers.""" + x = measures.binary_n(1000, min_n=50) + self.assertSequenceEqual(x, [500, 250, 125, 62]) - def test_logarithmic_n(self): - x = nolds.logarithmic_n(4, 11, 1.51) - self.assertSequenceEqual(x, [4, 6, 9]) + def test_binary_n_empty(self) -> None: + """Hypothesis: binary_n gives empty output if min_n is set too high.""" + x = measures.binary_n(50, min_n=50) + self.assertSequenceEqual(x, []) - def test_logarithmic_r(self): - x = nolds.logarithmic_r(4, 10, 1.51) - self.assertSequenceEqual(x, [4, 6.04, 9.1204]) + def test_logarithmic_n(self) -> None: + """Hypothesis: logarithmic_n outputs integers that follow an exponential series.""" + x = measures.logarithmic_n(4, 11, 1.51) + self.assertSequenceEqual(x, [4, 6, 9]) + def test_logarithmic_r(self) -> None: + """Hypothesis: logarithmic_r 
outputs floats that follow an exponential series.""" + x = measures.logarithmic_r(4, 10, 1.51) + self.assertSequenceEqual(x, [4, 6.04, 9.1204]) -class TestNoldsLyap(unittest.TestCase): - """ - Tests for lyap_e and lyap_r - """ - def test_lyap_logistic(self): - rvals = [2.5, 3.4, 3.7, 4.0] - sign = [-1, -1, 1, 1] - x0 = 0.1 - - def logistic(x, r): - return r * x * (1 - x) - - for r, s in zip(rvals, sign): - log = [] - x = x0 - for _ in range(100): - x = logistic(x, r) - log.append(x) - log = np.array(log, dtype="float32") - le = np.max(nolds.lyap_e(log, emb_dim=6, matrix_dim=2)) - lr = nolds.lyap_r(log, emb_dim=6, lag=2, min_tsep=10, trajectory_len=20) - self.assertEqual(s, int(np.sign(le)), "r = {}".format(r)) - self.assertEqual(s, int(np.sign(lr)), "r = {}".format(r)) - - def test_lyap_lorenz(self): - """Test hypothesis: Both lyap_r and lyap_e can reconstruct the largest Lyapunov exponent of the Lorenz system. - - The parameters for generating the Lorenz system were chosen to be as close as - possible to the experiments performed by Leonov and Kuznetsov (see [l_4]_) - and . - - For performance reasons the size of the input data was reduced and therefore the - assert conditions needed to be relaxed a bit. - - .. [l_4] G. A. Leonov and N. V. Kuznetsov, “On differences and - similarities in the analysis of Lorenz, Chen, and Lu systems,” - Applied Mathematics and Computation, vol. 256, pp. 334–343, 2015, - doi: 10.1016/j.amc.2014.12.132. 
- """ - data = datasets.lorenz_euler(3000, 10, 28, 8/3.0, start=[1,1,1], dt=0.01)[1000:] - lyap_r_args = dict(min_tsep=10, emb_dim=5, tau=0.01, lag=5, trajectory_len=28, fit_offset=8, fit="poly") - lyap_rx = nolds.lyap_r(data[:, 0], **lyap_r_args) - lyap_ry = nolds.lyap_r(data[:, 1], **lyap_r_args) - lyap_rz = nolds.lyap_r(data[:, 2], **lyap_r_args) - lyap_e_args = dict(min_tsep=10, emb_dim=5, matrix_dim=5, tau=0.01, min_nb=8) - lyap_ex = nolds.lyap_e(data[:, 0], **lyap_e_args) - lyap_ey = nolds.lyap_e(data[:, 1], **lyap_e_args) - lyap_ez = nolds.lyap_e(data[:, 2], **lyap_e_args) - self.assertAlmostEqual(2.4, lyap_rx, delta=0.5) - self.assertAlmostEqual(2.4, lyap_ry, delta=0.5) - self.assertAlmostEqual(2.4, lyap_rz, delta=0.5) - self.assertGreater(lyap_ex[0], 1.5) - self.assertGreater(lyap_ey[0], 1.5) - self.assertGreater(lyap_ez[0], 1.5) - - def test_lyap_fbm(self): - data = datasets.fbm(1000, H=0.3) - le = nolds.lyap_e(data, emb_dim=7, matrix_dim=3) - self.assertGreater(np.max(le), 0) - - def test_lyap_r_limits(self): - """ - tests if minimal input size is correctly calculated - """ - np.random.seed(0) - for i in range(10): - kwargs = { - "emb_dim": np.random.randint(1,10), - "lag": np.random.randint(1,6), - "min_tsep": np.random.randint(0,5), - "trajectory_len": np.random.randint(2,10) - } - min_len = nolds.lyap_r_len(**kwargs) - for i in reversed(range(max(1,min_len-5),min_len+5)): - data = np.random.random(i) - if i < min_len: - ## too few data points => execution should fail - try: - with warnings.catch_warnings(): - warnings.simplefilter("ignore", RuntimeWarning) - nolds.lyap_r(data, fit="poly", **kwargs) - msg = "{} data points should be required for kwargs {}, but " \ - + "{} where enough" - self.fail(msg.format( - min_len, - kwargs, - i - )) - except ValueError as e: - #print(e) - pass - else: - ## enough data points => execution should succeed - msg = "{} data points should be enough for kwargs {}, but " \ - + " {} where too few" - try: - 
self.assertTrue( - np.all(np.isfinite(nolds.lyap_r(data, fit="poly", **kwargs))), - msg.format(min_len, kwargs, i) - ) - except ValueError as e: - self.fail( - msg.format(min_len, kwargs, i) + ", original error: "+str(e) - ) - def test_lyap_e_limits(self): - """ - tests if minimal input size is correctly calculated - """ - np.random.seed(1) - for i in range(10): - kwargs = { - "matrix_dim": np.random.randint(2,10), - "min_tsep": np.random.randint(0,10), - "min_nb": np.random.randint(2,15) - } - kwargs["emb_dim"] = np.random.randint(1,4) \ - * (kwargs["matrix_dim"] - 1) + 1 - min_len = nolds.lyap_e_len(**kwargs) - for i in reversed(range(max(1,min_len-5),min_len+5)): - data = np.random.random(i) - if i < min_len: - ## too few data points => execution should fail - try: - with warnings.catch_warnings(): - warnings.simplefilter("ignore", RuntimeWarning) - nolds.lyap_e(data, **kwargs) - msg = "{} data points should be required for kwargs {}, but " \ - + "{} where enough" - self.fail(msg.format( - min_len, - kwargs, - i - )) - except ValueError as e: - #print(e) - pass - else: - ## enough data points => execution should succeed - msg = "{} data points should be enough for kwargs {}, but " \ - + " {} where too few" - try: - self.assertTrue( - np.all(np.isfinite(nolds.lyap_e(data, **kwargs))), - msg.format(min_len, kwargs, i) - ) - except ValueError as e: - self.fail( - msg.format(min_len, kwargs, i) + ", original error: "+str(e) - ) +class NoldsMeasure(Protocol): + """Protocol for typing methods that take a float array as first parameter.""" + + def __call__(self, data: measures.FloatArrayLike1D) -> Any: # noqa: ANN401 + """Call the measure.""" + + +class TestNoldsLyap(unittest.TestCase): + """Tests for lyap_e and lyap_r.""" + + def test_lyap_logistic(self) -> None: + """Hypothesis: The output of lyap_e and lyap_r on a logistic map has the correct sign.""" + rvals = [2.5, 3.4, 3.7, 4.0] + sign = [-1, -1, 1, 1] + x0 = 0.1 + + def logistic(x: float, r: float) -> float: + 
"""Logistic map.""" + return r * x * (1 - x) + + for r, s in zip(rvals, sign, strict=True): + log = [] + x = x0 + for _ in range(100): + x = logistic(x, r) + log.append(x) + log = np.array(log, dtype=np.float64) + with self.subTest(measure="lyap_e", r=r): + le = np.max(measures.lyap_e(log, emb_dim=6, matrix_dim=2)) + self.assertEqual(s, np.sign(le)) + with self.subTest(measure="lyap_r", r=r): + lr = measures.lyap_r(log, emb_dim=6, lag=2, min_tsep=10, trajectory_len=20) + self.assertEqual(s, np.sign(lr)) + + def test_lyap_lorenz(self) -> None: + """Hypothesis: lyap_r and lyap_e match expected values for the Lorenz system. + + The parameters for generating the Lorenz system were chosen to be as close as + possible to the experiments performed by Leonov and Kuznetsov (see [l_4]_) + and . + + For performance reasons the size of the input data was reduced and therefore the + assert conditions needed to be relaxed a bit. + + .. [l_4] G. A. Leonov and N. V. Kuznetsov, “On differences and + similarities in the analysis of Lorenz, Chen, and Lu systems,” + Applied Mathematics and Computation, vol. 256, pp. 334–343, 2015, + doi: 10.1016/j.amc.2014.12.132. 
+ """ + data = datasets.lorenz_euler(3000, 10, 28, 8 / 3.0, start=[1, 1, 1], dt=0.01)[1000:] + lyap_r_args = { + "min_tsep": 10, + "emb_dim": 5, + "tau": 0.01, + "lag": 5, + "trajectory_len": 28, + "fit_offset": 8, + "fit": "poly", + } + lyap_e_args = { + "min_tsep": 10, + "emb_dim": 5, + "matrix_dim": 5, + "tau": 0.01, + "min_nb": 8, + } + with self.subTest(measure="lyap_r", axis="x"): + lyap_rx = measures.lyap_r(data[:, 0], **lyap_r_args) + self.assertAlmostEqual(2.4, lyap_rx, delta=0.5) + with self.subTest(measure="lyap_r", axis="y"): + lyap_ry = measures.lyap_r(data[:, 1], **lyap_r_args) + self.assertAlmostEqual(2.4, lyap_ry, delta=0.5) + with self.subTest(measure="lyap_r", axis="z"): + lyap_rz = measures.lyap_r(data[:, 2], **lyap_r_args) + self.assertAlmostEqual(2.4, lyap_rz, delta=0.5) + with self.subTest(measure="lyap_e", axis="x"): + lyap_ex = measures.lyap_e(data[:, 0], **lyap_e_args) + self.assertGreater(lyap_ex[0], 1.5) + with self.subTest(measure="lyap_e", axis="y"): + lyap_ey = measures.lyap_e(data[:, 1], **lyap_e_args) + self.assertGreater(lyap_ey[0], 1.5) + with self.subTest(measure="lyap_e", axis="z"): + lyap_ez = measures.lyap_e(data[:, 2], **lyap_e_args) + self.assertGreater(lyap_ez[0], 1.5) + + def test_lyap_fbm(self) -> None: + """Hypothesis: lyap_e produces positive output for fractional brownian motion.""" + data = datasets.fbm(1000, H=0.3) + le = measures.lyap_e(data, emb_dim=7, matrix_dim=3) + self.assertGreater(float(np.max(le)), 0) + + def assert_insufficient_length( + self, + min_len: int, + kwargs: dict[str, Any], + input_data: measures.FloatArray1D, + measure: NoldsMeasure, + ) -> None: + """Ensures that the length of the given data would actually lead to an error. 
+ + Args: + min_len: reported minimum length + kwargs: kwargs to be passed to the measure + input_data: data with length less than `min_len` + measure: the nolds measure to test (either `lyap_r` or `lyap_e`) + """ + msg = ( + f"{min_len} data points should be required for kwargs {kwargs}, " + f"but {input_data.shape[0]} were enough" + ) + with self.assertRaises(ValueError, msg=msg), warnings.catch_warnings(): + warnings.simplefilter("ignore", RuntimeWarning) + measure(input_data, **kwargs) # pyright: ignore reportArgumentType + + def assert_sufficient_length( + self, + min_len: int, + kwargs: dict[str, Any], + input_data: measures.FloatArray1D, + measure: NoldsMeasure, + ) -> None: + """Ensures that the length of the given data does not lead to an error. + + Args: + min_len: reported minimum length + kwargs: kwargs to be passed to the measure + input_data: data with length at least `min_len` + measure: the nolds measure to test (either `lyap_r` or `lyap_e`) + """ + msg = ( + f"{min_len} data points should be enough for kwargs {kwargs}, but " + f"{input_data.shape[0]} were too few" + ) + try: + assert np.all(np.isfinite(measure(input_data, **kwargs))), msg + except ValueError as e: + raise ValueError(msg) from e + + def test_lyap_r_limits(self) -> None: + """Hypothesis: Minimal input size for lyap_r is correctly calculated. + + For each of 10 random parameter settings, we test a range of input sizes around + the supposed minimum number of inputs. For numbers smaller than the calculated + minimum we expect the call of lyap_r to fail, for numbers greater or equal, it + should succeed. 
+ """ + rng = np.random.default_rng(seed=0) + for _ in range(10): + kwargs: dict[str, Any] = { + "emb_dim": rng.integers(1, 10), + "lag": rng.integers(1, 6), + "min_tsep": rng.integers(0, 5), + "trajectory_len": rng.integers(2, 10), + } + min_len = measures.lyap_r_len(**kwargs) # pyright: ignore reportArgumentType + kwargs["fit"] = "poly" + for actual_len in reversed(range(max(1, min_len - 5), min_len + 5)): + data = rng.random(actual_len) + with self.subTest( + emb_dim=kwargs["emb_dim"], + lag=kwargs["lag"], + min_tsep=kwargs["min_tsep"], + trajectory_len=kwargs["trajectory_len"], + min_len=min_len, + actual_len=actual_len, + ): + if actual_len < min_len: + ## too few data points => execution should fail + self.assert_insufficient_length( + min_len=min_len, + kwargs=kwargs, + input_data=data, + measure=measures.lyap_r, + ) + else: + ## enough data points => execution should succeed + self.assert_sufficient_length( + min_len=min_len, + kwargs=kwargs, + input_data=data, + measure=measures.lyap_r, + ) + + def test_lyap_e_limits(self) -> None: + """Tests if minimal input size is correctly calculated.""" + rng = np.random.default_rng(seed=1) + for _ in range(10): + kwargs = { + "matrix_dim": rng.integers(2, 10), + "min_tsep": rng.integers(0, 10), + "min_nb": rng.integers(2, 15), + } + kwargs["emb_dim"] = rng.integers(1, 4) * (kwargs["matrix_dim"] - 1) + 1 + min_len = measures.lyap_e_len(**kwargs) # pyright: ignore reportArgumentType + for actual_len in reversed(range(max(1, min_len - 5), min_len + 5)): + data = rng.random(actual_len) + with self.subTest( + matrix_dim=kwargs["matrix_dim"], + min_tsep=kwargs["min_tsep"], + min_nb=kwargs["min_nb"], + min_len=min_len, + actual_len=actual_len, + ): + if actual_len < min_len: + ## too few data points => execution should fail + self.assert_insufficient_length( + min_len=min_len, + kwargs=kwargs, + input_data=data, + measure=measures.lyap_e, + ) + else: + ## enough data points => execution should succeed + 
self.assert_sufficient_length( + min_len=min_len, + kwargs=kwargs, + input_data=data, + measure=measures.lyap_e, + ) class TestNoldsHurst(unittest.TestCase): - """ - Tests for hurst_rs - """ - def test_hurst_basic(self): - np.random.seed(2) - # strong negative correlation between successive elements - seq_neg = [] - x = np.random.random() - for _ in range(10000): - x = -x + np.random.random() - 0.5 - seq_neg.append(x) - h_neg = nolds.hurst_rs(seq_neg) - #print("h_neg = %.3f" % h_neg) - # expected h is around 0 - self.assertLess(h_neg, 0.3) - - # no correlation, just random noise - x = np.random.randn(10000) - h_rand = nolds.hurst_rs(x) - #print("h_rand = %.3f" % h_rand) - # expected h is around 0.5 - self.assertAlmostEqual(h_rand, 0.5, delta=0.1) - - # cumulative sum has strong positive correlation between - # elements - walk = np.cumsum(x) - h_walk = nolds.hurst_rs(walk) - #print("h_walk = %.3f" % h_walk) - # expected h is around 1.0 - self.assertGreater(h_walk, 0.9) - - def test_hurst_pracma(self): - """ - Tests for hurst_rs using the same tests as in the R-package pracma - """ - np.random.seed(3) - # This test reproduces the results presented by Ian L. Kaplan on - # bearcave.com - h72 = nolds.hurst_rs( - datasets.brown72, fit="poly", corrected=False, unbiased=False, - nvals=2**np.arange(3,11)) - #print("h72 = %.3f" % h72) - self.assertAlmostEqual(h72, 0.72, delta=0.01) - - xgn = np.random.normal(size=10000) - hgn = nolds.hurst_rs(xgn, fit="poly") - #print("hgn = %.3f" % hgn) - self.assertAlmostEqual(hgn, 0.5, delta=0.1) - - xlm = np.fromiter(datasets.logistic_map(0.1,1024),dtype="float32") - hlm = nolds.hurst_rs(xlm, fit="poly", nvals=2**np.arange(3,11)) - #print("hlm = %.3f" % hlm) - self.assertAlmostEqual(hlm, 0.43, delta=0.05) - - def test_hurst_lorenz(self): - """Test hypothesis: We get correct values for estimating the hurst exponent of the Lorenz system. - - All parameter values are chosen to replicate the experiment by Suyal et al. 
(see [l_3]_) - as closely as possible. - - For performance reasons the size of the input data was reduced and therefore the - assert conditions needed to be relaxed a bit. - - .. [l_3] V. Suyal, A. Prasad, and H. P. Singh, “Nonlinear Time Series - Analysis of Sunspot Data,” Sol Phys, vol. 260, no. 2, pp. 441–449, - 2009, doi: 10.1007/s11207-009-9467-x. - """ - data = datasets.lorenz_euler(3000, 10, 28, 8/3.0, start=[1,1,1], dt=0.01)[1000:] - hurst_rs_args = dict(fit="poly", nvals=nolds.logarithmic_n(10, 70, 1.1)) - hx = nolds.hurst_rs(data[:, 0], **hurst_rs_args) - hy = nolds.hurst_rs(data[:, 1], **hurst_rs_args) - hz = nolds.hurst_rs(data[:, 2], **hurst_rs_args) - self.assertAlmostEqual(0.9, hx, delta=0.05) - self.assertAlmostEqual(0.9, hy, delta=0.05) - self.assertAlmostEqual(0.9, hz, delta=0.05) + """Tests for hurst_rs.""" + + @classmethod + def setUpClass(cls) -> None: + """Create data required for test methods.""" + rng = np.random.default_rng(seed=2) + # strong negative correlation between successive elements + cls.negative_correlation = [] + x = rng.random() + for _ in range(10000): + x = -x + rng.random() - 0.5 + cls.negative_correlation.append(x) + # no correlation, just gaussian noise + cls.no_correlation = rng.standard_normal(10000) + # cumulative sum has strong positive correlation between + # elements + cls.positive_correlation = np.cumsum(cls.no_correlation) + + def test_hurst_negative_correlation(self) -> None: + """Hypothesis: H < 0.5 for data with negative correlation between successive elements.""" + h_neg = measures.hurst_rs(self.negative_correlation) + # expected h is around 0 + self.assertLess(h_neg, 0.3) + + def test_hurst_gaussian_noise(self) -> None: + """Hypothesis: H ~= 0.5 for gaussian noise.""" + h_rand = measures.hurst_rs(self.no_correlation) + # expected h is around 0.5 + self.assertAlmostEqual(h_rand, 0.5, delta=0.1) + + def test_hurst_positive_correlation(self) -> None: + """Hypothesis: H > 0.5 for data with positive correlation 
between successive elements.""" + h_walk = measures.hurst_rs(self.positive_correlation) + # expected h is around 1.0 + self.assertGreater(h_walk, 0.9) + + def test_hurst_pracma_bearcave(self) -> None: + """Hypothesis: `hurst_rs` passes test from R-package pracma using brown72 dataset.""" + # This test reproduces the results presented by Ian L. Kaplan on + # http://bearcave.com/misl/misl_tech/wavelets/hurst/index.html + h72 = measures.hurst_rs( + datasets.brown72, + fit="poly", + corrected=False, + unbiased=False, + nvals=2 ** np.arange(3, 11), + ) + self.assertAlmostEqual(h72, 0.72, delta=0.01) + + def test_hurst_pracma_logistic(self) -> None: + """Hypothesis: `hurst_rs` passes test from R-package pracma using logistic map.""" + xlm = np.fromiter(datasets.logistic_map(0.1, 1024), dtype=np.float64) + hlm = measures.hurst_rs(xlm, fit="poly", nvals=2 ** np.arange(3, 11)) + self.assertAlmostEqual(hlm, 0.43, delta=0.05) + + def test_hurst_lorenz(self) -> None: + """Hypothesis: We get correct values for estimating the hurst exponent of the Lorenz system. + + All parameter values are chosen to replicate the experiment by Suyal et al. (see [l_3]_) + as closely as possible. + + For performance reasons the size of the input data was reduced and therefore the + assert conditions needed to be relaxed a bit. + + .. [l_3] V. Suyal, A. Prasad, and H. P. Singh, “Nonlinear Time Series + Analysis of Sunspot Data,” Sol Phys, vol. 260, no. 2, pp. 441–449, + 2009, doi: 10.1007/s11207-009-9467-x. 
+ """ + data = datasets.lorenz_euler(3000, 10, 28, 8 / 3.0, start=[1, 1, 1], dt=0.01)[1000:] + hurst_rs_args = {"fit": "poly", "nvals": measures.logarithmic_n(10, 70, 1.1)} + with self.subTest(axis="x"): + hx = measures.hurst_rs(data[:, 0], **hurst_rs_args) + self.assertAlmostEqual(0.9, hx, delta=0.05) + with self.subTest(axis="y"): + hy = measures.hurst_rs(data[:, 1], **hurst_rs_args) + self.assertAlmostEqual(0.9, hy, delta=0.05) + with self.subTest(axis="z"): + hz = measures.hurst_rs(data[:, 2], **hurst_rs_args) + self.assertAlmostEqual(0.9, hz, delta=0.05) -class TestNoldsDFA(unittest.TestCase): - """ - Tests for dfa - """ - def test_dfa_base(self): - np.random.seed(4) - # strong negative correlation between successive elements - seq_neg = [] - x = np.random.random() - for _ in range(10000): - x = -x + np.random.random() - 0.5 - seq_neg.append(x) - h_neg = nolds.dfa(seq_neg) - # expected h is around 0 - self.assertLess(h_neg, 0.3) - - # no correlation, just random noise - x = np.random.randn(10000) - h_rand = nolds.dfa(x) - # expected h is around 0.5 - self.assertLess(h_rand, 0.7) - self.assertGreater(h_rand, 0.3) - - # cumulative sum has strong positive correlation between - # elements - walk = np.cumsum(x) - h_walk = nolds.dfa(walk) - # expected h is around 1.0 - self.assertGreater(h_walk, 0.7) - - def test_dfa_fbm(self): - hs = [0.3, 0.5, 0.7] - for h in hs: - data = datasets.fbm(1000, H=h) - he = nolds.dfa(data) - self.assertAlmostEqual(he, h + 1, delta=0.15) - - def test_dfa_lorenz(self): - """Test hypothesis: We get correct values for estimating the Hurst parameter of the Lorenz system. - - All parameter values are chosen to replicate the experiment by Wallot et al. (see [l_5]_) - as closely as possible. - - For performance reasons the size of the input data was reduced and therefore the - assert conditions needed to be relaxed a bit. - - .. [l_5] S. Wallot, J. P. Irmer, M. Tschense, N. Kuznetsov, A. Højlund, - and M. 
Dietz, “A Multivariate Method for Dynamic System Analysis: - Multivariate Detrended Fluctuation Analysis Using Generalized Variance,” - Topics in Cognitive Science, p. tops.12688, Sep. 2023, - doi: 10.1111/tops.12688. - """ - data = datasets.lorenz_euler(120000, 10, 28, 8/3.0, start=[0.1,0.1,0.1], dt=0.002)[20000:] - nvals = nolds.logarithmic_n(200, len(data)/8, 2**0.2) - dfa_args = dict(nvals=nvals, order=2, overlap=False, fit_exp="poly") - dx = nolds.dfa(data[:, 0], **dfa_args) - dy = nolds.dfa(data[:, 1], **dfa_args) - dz = nolds.dfa(data[:, 2], **dfa_args) - self.assertAlmostEqual(1.008, dx, delta=0.04) - self.assertAlmostEqual(0.926, dy, delta=0.032) - self.assertAlmostEqual(0.650, dz, delta=0.44) - - def test_dfa_agreement_with_physionet(self): - """Test hypothesis: Using the same parameters, the output of nolds is identical to the output of PhysioNet.""" - lorenz_x, physionet_points = datasets.load_lorenz_physionet() - nvals = [round(x) for x in 10 ** physionet_points[:,0]] - _, (_, nolds_rs, _) = nolds.dfa(lorenz_x, nvals=nvals, overlap=False, fit_exp="poly", debug_data=True) - nolds_rs_log10 = nolds_rs / np.log(10) - # assert that sum of squared errors is less than 1e-9 - self.assertLess(sum((physionet_points[:,1] - nolds_rs_log10)**2), 1e-9) - - @unittest.skipUnless(SCIPY_AVAILABLE, "Tests using Lévy motion require scipy.") - def test_dfa_levy(self): - """Test hypothesis: We get correct values for estimating the Hurst parameter of Lévy motion. - - Reference: https://github.com/CSchoel/nolds/issues/17#issuecomment-1905472813. 
- """ - alpha = 1.5 - x = levy_stable.rvs(alpha=alpha, beta=0, size=10000) - h = nolds.dfa(x, fit_exp="poly") - self.assertAlmostEqual(0.5, h, delta=0.1) +class TestNoldsDFA(unittest.TestCase): + """Tests for dfa.""" + + @classmethod + def setUpClass(cls) -> None: + """Create data required for test methods.""" + rng = np.random.default_rng(seed=4) + # strong negative correlation between successive elements + cls.negative_correlation = [] + x = rng.random() + for _ in range(10000): + x = -x + rng.random() - 0.5 + cls.negative_correlation.append(x) + # no correlation, just gaussian noise + cls.no_correlation = rng.standard_normal(10000) + # cumulative sum has strong positive correlation between + # elements + cls.positive_correlation = np.cumsum(cls.no_correlation) + + def test_dfa_negative_correlation(self) -> None: + """Hypothesis: H < 0.5 for data with negative correlation between successive elements.""" + h_neg = measures.dfa(self.negative_correlation) + # expected h is around 0 + self.assertLess(h_neg, 0.3) + + def test_dfa_no_correlation(self) -> None: + """Hypothesis: H ~= 0.5 for gaussian noise.""" + h_rand = measures.dfa(self.no_correlation) + self.assertAlmostEqual(h_rand, 0.5, delta=0.2) + + def test_dfa_positive_correlation(self) -> None: + """Hypothesis: H > 0.5 for data with positive correlation between successive elements.""" + h_walk = measures.dfa(self.positive_correlation) + # expected h is around 1.0 + self.assertGreater(h_walk, 0.7) + + def test_dfa_fbm(self) -> None: + """Hypothesis: H ~= h + 1 for fractional brownian motion with Hurst parameter h.""" + hs = [0.3, 0.5, 0.7] + for h in hs: + with self.subTest(h=h): + data = datasets.fbm(1000, H=h) + he = measures.dfa(data) + self.assertAlmostEqual(he, h + 1, delta=0.15) + + def test_dfa_lorenz(self) -> None: + """Hypothesis: We get correct values for the Lorenz system. + + All parameter values are chosen to replicate the experiment by Wallot et al. (see [l_5]_) + as closely as possible. 
+ + For performance reasons the size of the input data was reduced and therefore the + assert conditions needed to be relaxed a bit. + + .. [l_5] S. Wallot, J. P. Irmer, M. Tschense, N. Kuznetsov, A. Højlund, + and M. Dietz, “A Multivariate Method for Dynamic System Analysis: + Multivariate Detrended Fluctuation Analysis Using Generalized Variance,” + Topics in Cognitive Science, p. tops.12688, Sep. 2023, + doi: 10.1111/tops.12688. + """ + data = datasets.lorenz_euler(120000, 10, 28, 8 / 3.0, start=[0.1, 0.1, 0.1], dt=0.002)[ + 20000: + ] + nvals = measures.logarithmic_n(200, np.ceil(len(data) / 8), 2**0.2) + dfa_args = {"nvals": nvals, "order": 2, "overlap": False, "fit_exp": "poly"} + with self.subTest(axis="x"): + dx = measures.dfa(data[:, 0], **dfa_args) + self.assertAlmostEqual(1.008, dx, delta=0.04) + with self.subTest(axis="y"): + dy = measures.dfa(data[:, 1], **dfa_args) + self.assertAlmostEqual(0.926, dy, delta=0.032) + with self.subTest(axis="z"): + dz = measures.dfa(data[:, 2], **dfa_args) + self.assertAlmostEqual(0.650, dz, delta=0.44) + + def test_dfa_agreement_with_physionet(self) -> None: + """Hypothesis: The output of nolds is identical to the output of PhysioNet.""" + lorenz_x, physionet_points = datasets.load_lorenz_physionet() + nvals = [round(x) for x in 10 ** physionet_points[:, 0]] + _, (_, nolds_rs, _) = measures.dfa( + lorenz_x, nvals=nvals, overlap=False, fit_exp="poly", debug_data=True + ) + nolds_rs_log10 = nolds_rs / np.log(10) + with self.subTest(kind="individual"): + assert_almost_equal(nolds_rs_log10, physionet_points[:, 1], decimal=5) + with self.subTest(kind="sse"): + # assert that sum of squared errors is less than 1e-9 + sse = sum((physionet_points[:, 1] - nolds_rs_log10) ** 2) + self.assertLess(sse, 1e-09) + + @unittest.skipUnless(SCIPY_AVAILABLE, "Tests using Lévy motion require scipy.") + def test_dfa_levy(self) -> None: + """Hypothesis: We get correct values for estimating the Hurst parameter of Lévy motion. 
+ + Reference: https://github.com/CSchoel/nolds/issues/17#issuecomment-1905472813. + """ + alpha = 1.5 + x = cast("np.typing.NDArray[Any]", levy_stable.rvs(alpha=alpha, beta=0, size=10000)) + h = measures.dfa(x, fit_exp="poly") + self.assertAlmostEqual(0.5, h, delta=0.1) class TestNoldsCorrDim(unittest.TestCase): - """ - Tests for corr_dim - """ - def test_corr_dim(self): - np.random.seed(5) - n = 1000 - data = np.arange(n) - cd = nolds.corr_dim(data, 4) - self.assertAlmostEqual(cd, 1, delta=0.05) - # TODO what is the prescribed correlation dimension for random data? - data = np.random.random(n) - cd = nolds.corr_dim(data, 4, fit="poly") - self.assertAlmostEqual(cd, 0.5, delta=0.15) - # TODO test example for cd > 1 - - def test_lorenz(self): - """Test hypothesis: We get correct values for estimating the correlation dimension of the Lorenz system. - - All parameter values are chosen to replicate the experiment by Grassberger and Procaccia (1983) - as closely as possible. - - For performance reasons the size of the input data was reduced and therefore the - assert conditions needed to be relaxed a bit. The settings of n, discard, - lag, emb_dim, and rvals were determined experimentally to find the smallest - dataset that yields the results reported. - - .. [l_1] P. Grassberger and I. Procaccia, “Measuring the strangeness - of strange attractors,” Physica D: Nonlinear Phenomena, vol. 9, - no. 1, pp. 189–208, 1983. 
- """ - discard = 5000 - n = 5000 - lag = 10 - emb_dim = 5 - data = datasets.lorenz_euler(n + discard, 10, 28, 8/3, start=(1,1,1), dt=0.012) - x = data[discard:,1] - rvals = nolds.logarithmic_r(1, np.e, 1.1) # determined experimentally - cd = nolds.corr_dim(x, emb_dim, fit="poly", rvals=rvals, lag=lag) - self.assertAlmostEqual(cd, 2.05, delta=0.2) - - def test_logistic(self): - # TODO replicate tests with logistic map from grassberger-procaccia - pass + """Tests for corr_dim.""" + + @classmethod + def setUpClass(cls) -> None: + """Create data required for test methods.""" + rng = np.random.default_rng(seed=5) + n = 1000 + cls.cd1 = np.arange(n) + # TODO: what is the prescribed correlation dimension for random data? + cls.cd0p5 = rng.random(n) + + def test_corr_dim_1(self) -> None: + """Hypothesis: Correlation dimensions is close to 1 for highly correlated dataset.""" + cd = measures.corr_dim(self.cd1, 4) + self.assertAlmostEqual(cd, 1, delta=0.05) + + def test_corr_dim_0p5(self) -> None: + """Hypothesis: Correlation dimension is close to 0.5 for dataset without correlations.""" + cd = measures.corr_dim(self.cd0p5, 4, fit="poly") + self.assertAlmostEqual(cd, 0.5, delta=0.15) + + # TODO: test example for cd > 1 + + def test_lorenz(self) -> None: + """Hypothesis: We get correct values for the Lorenz system. + + All parameter values are chosen to replicate the experiment by Grassberger and Procaccia + (1983) as closely as possible. + + For performance reasons the size of the input data was reduced and therefore the + assert conditions needed to be relaxed a bit. The settings of n, discard, + lag, emb_dim, and rvals were determined experimentally to find the smallest + dataset that yields the results reported. + + .. [l_1] P. Grassberger and I. Procaccia, “Measuring the strangeness + of strange attractors,” Physica D: Nonlinear Phenomena, vol. 9, + no. 1, pp. 189–208, 1983. 
+ """ + discard = 5000 + n = 5000 + lag = 10 + emb_dim = 5 + data = datasets.lorenz_euler(n + discard, 10, 28, 8 / 3, start=[1, 1, 1], dt=0.012) + x = data[discard:, 1] + rvals = measures.logarithmic_r(1, np.e, 1.1) # determined experimentally + cd = measures.corr_dim(x, emb_dim, fit="poly", rvals=rvals, lag=lag) + self.assertAlmostEqual(cd, 2.05, delta=0.2) + + def test_logistic(self) -> None: + """Hypothesis: We get correct values for the logistic map.""" + # TODO: replicate tests with logistic map from grassberger-procaccia class TestNoldsSampEn(unittest.TestCase): - """ - Tests for sampen - """ - def test_sampen_base(self): - data = [0, 1, 5, 4, 1, 0, 1, 5, 3] - # matches for m=2: 01-01, 15-15 - # matches for m=3: 015-015 - se = nolds.sampen(data) - self.assertAlmostEqual(se, -np.log(1.0/2), delta=0.01) - data = [1, 2, 1, 2.4, 1, 4] - # matches for m=1: 1-1,1-1,2-2.4,1-1 - # matches for m=2: [1,2]-[1,2.4], [2,1]-[2.4,1] - se = nolds.sampen(data, emb_dim=1, tolerance=0.5) - self.assertAlmostEqual(se, -np.log(2.0/4), delta=0.01) - data = [0, 20, 1, 2, 3, 4, 40, 60, 1.4, 2.4, 3.4, 80, 100, 1.4, 2.4, 3.4, - 4, 120, 140, 180] - # maches for m=3: [1,2,3]-[1.4,2.4,3.4],[1,2,3]-[1.4,2.4,3.4], - # [2,3,4]-[2.4,3.4,4], [1.4,2.4,3.4]-[1.4,2.4,3.4] - # matches for m=4: [1,2,3,4]-[1.4,2.4,3.4,4] - se = nolds.sampen(data, emb_dim=3, tolerance=0.5) - self.assertAlmostEqual(se, -np.log(1.0/4), delta=0.01) - - def test_sampen_logistic(self): - # logistic map with r = 2.8 => static value - data = list(datasets.logistic_map(0.45, 1000, r=2.8)) - self.assertAlmostEqual(0, nolds.sampen(data), delta=0.001) - self.assertAlmostEqual(0, nolds.sampen(data[100:], emb_dim=5), delta=0.001) - # logistic map with r = 3.3 => oscillation between two values - data = list(datasets.logistic_map(0.45, 1000, r=3.3)) - self.assertAlmostEqual(0, nolds.sampen(data), delta=0.001) - self.assertAlmostEqual(0, nolds.sampen(data[100:], emb_dim=5), delta=0.001) - # logistic map with r = 3.5 => oscillation 
between four values - data = list(datasets.logistic_map(0.45, 1000, r=3.5)) - self.assertAlmostEqual(0, nolds.sampen(data), delta=0.001) - self.assertAlmostEqual(0, nolds.sampen(data[100:], emb_dim=5), delta=0.001) - # logistic map with r = 3.9 => chaotic behavior - data = list(datasets.logistic_map(0.45, 1000, r=3.9)) - self.assertAlmostEqual(0.5, nolds.sampen(data[100:]), delta=0.1) - self.assertAlmostEqual(0.5, nolds.sampen(data[100:], emb_dim=5), delta=0.1) - - def test_sampen_random(self): - np.random.seed(6) - # normally distributed random numbers - data = np.random.randn(10000) - self.assertAlmostEqual(2.2, nolds.sampen(data), delta=0.1) - self.assertAlmostEqual(2.2, nolds.sampen(data, emb_dim=2), delta=0.1) - # TODO add tests with uniformly distributed random numbers - - def test_sampen_sinus(self): - # TODO add test with sinus signal - pass - - - def test_sampen_lorenz(self): - """Test hypothesis: We get correct values for estimating the sample entropy of the Lorenz system. - - All parameter values are chosen to replicate the experiment by Kaffashi et al. (2008) - as closely as possible. - - For performance reasons the size of the input data was reduced and therefore the - assert conditions needed to be relaxed a bit. - - .. [l_2] F. Kaffashi, R. Foglyano, C. G. Wilson, and K. A. Loparo, - “The effect of time delay on Approximate & Sample Entropy - calculations,” Physica D: Nonlinear Phenomena, vol. 237, no. 23, - pp. 3069–3074, 2008, doi: 10.1016/j.physd.2008.06.005. 
- """ - data = datasets.lorenz_euler(3000, 10, 28, 8/3.0, start=[1,1,1], dt=0.01)[1000:] - sampen_args = dict(emb_dim=2, lag=1) - sx = nolds.sampen(data[:, 0], **sampen_args) - sy = nolds.sampen(data[:, 1], **sampen_args) - sz = nolds.sampen(data[:, 2], **sampen_args) - self.assertAlmostEqual(0.15, sx, delta=0.05) - self.assertAlmostEqual(0.15, sy, delta=0.05) - self.assertAlmostEqual(0.25, sz, delta=0.05) + """Tests for sampen.""" + + def test_sampen_2(self) -> None: + """Hypothesis: `sampen` gives expected results for toy dataset with `emb_dim=2`.""" + data = [0, 1, 5, 4, 1, 0, 1, 5, 3] + # matches for m=2: 01-01, 15-15 + # matches for m=3: 015-015 + se = measures.sampen(data) + self.assertAlmostEqual(se, -np.log(1.0 / 2), delta=0.01) + + def test_sampen_1(self) -> None: + """Hypothesis: `sampen` gives expected results for toy dataset with `emb_dim=1`.""" + data = [1, 2, 1, 2.4, 1, 4] + # matches for m=1: 1-1,1-1,2-2.4,1-1 + # matches for m=2: [1,2]-[1,2.4], [2,1]-[2.4,1] + se = measures.sampen(data, emb_dim=1, tolerance=0.5) + self.assertAlmostEqual(se, -np.log(2.0 / 4), delta=0.01) + + def test_sampen_3(self) -> None: + """Hypothesis: `sampen` gives expected results for toy dataset with `emb_dim=3`.""" + data = [ + 0, + 20, + 1, + 2, + 3, + 4, + 40, + 60, + 1.4, + 2.4, + 3.4, + 80, + 100, + 1.4, + 2.4, + 3.4, + 4, + 120, + 140, + 180, + ] + # maches for m=3: [1,2,3]-[1.4,2.4,3.4],[1,2,3]-[1.4,2.4,3.4], + # [2,3,4]-[2.4,3.4,4], [1.4,2.4,3.4]-[1.4,2.4,3.4] # noqa: ERA001 + # matches for m=4: [1,2,3,4]-[1.4,2.4,3.4,4] + se = measures.sampen(data, emb_dim=3, tolerance=0.5) + self.assertAlmostEqual(se, -np.log(1.0 / 4), delta=0.01) + + def test_sampen_logistic_static(self) -> None: + """Hypothesis: `sampen` gives correct outputs for logistic map with static value.""" + # logistic map with r = 2.8 => static value + data = list(datasets.logistic_map(0.45, 1000, r=2.8)) + with self.subTest(emb_dim=2): + self.assertAlmostEqual(0, measures.sampen(data), delta=0.001) + 
with self.subTest(emb_dim=5): + self.assertAlmostEqual(0, measures.sampen(data[100:], emb_dim=5), delta=0.001) + + def test_sampen_logistic_oscillation_2(self) -> None: + """Hypothesis: `sampen` is correct for logistic map oscillating between 2 values.""" + # logistic map with r = 3.3 => oscillation between two values + data = list(datasets.logistic_map(0.45, 1000, r=3.3)) + with self.subTest(emb_dim=2): + self.assertAlmostEqual(0, measures.sampen(data), delta=0.001) + with self.subTest(emb_dim=5): + self.assertAlmostEqual(0, measures.sampen(data[100:], emb_dim=5), delta=0.001) + + def test_sampen_logistic_oscillation_4(self) -> None: + """Hypothesis: `sampen` is correct for logistic map oscillating between 4 values.""" + # logistic map with r = 3.5 => oscillation between four values + data = list(datasets.logistic_map(0.45, 1000, r=3.5)) + with self.subTest(emb_dim=2): + self.assertAlmostEqual(0, measures.sampen(data), delta=0.001) + with self.subTest(emb_dim=5): + self.assertAlmostEqual(0, measures.sampen(data[100:], emb_dim=5), delta=0.001) + + def test_sampen_logistic_chaotic(self) -> None: + """Hypothesis: `sampen` is correct for logistic map with chaotic behavior.""" + # logistic map with r = 3.9 => chaotic behavior + data = list(datasets.logistic_map(0.45, 1000, r=3.9)) + with self.subTest(emb_dim=2): + self.assertAlmostEqual(0.5, measures.sampen(data[100:]), delta=0.1) + with self.subTest(emb_dim=5): + self.assertAlmostEqual(0.5, measures.sampen(data[100:], emb_dim=5), delta=0.1) + + def test_sampen_gaussian(self) -> None: + """Hypothesis: `sampen` is correct for gaussian noise.""" + rng = np.random.default_rng(seed=6) + # normally distributed random numbers + data = rng.standard_normal(10000) + with self.subTest(emb_dim=2): + self.assertAlmostEqual(2.2, measures.sampen(data), delta=0.1) + with self.subTest(emb_dim=5): + self.assertAlmostEqual(2, measures.sampen(data[100:], emb_dim=5), delta=0.1) + + def test_sampen_sinus(self) -> None: + """Hypothesis: 
`sampen` is correct for a sinus signal.""" + # TODO: add test with sinus signal + + def test_sampen_lorenz(self) -> None: + """Hypothesis: We get correct values for estimating the sample entropy of the Lorenz system. + + All parameter values are chosen to replicate the experiment by Kaffashi et al. (2008) + as closely as possible. + + For performance reasons the size of the input data was reduced and therefore the + assert conditions needed to be relaxed a bit. + + .. [l_2] F. Kaffashi, R. Foglyano, C. G. Wilson, and K. A. Loparo, + “The effect of time delay on Approximate & Sample Entropy + calculations,” Physica D: Nonlinear Phenomena, vol. 237, no. 23, + pp. 3069–3074, 2008, doi: 10.1016/j.physd.2008.06.005. + """ + data = datasets.lorenz_euler(3000, 10, 28, 8 / 3.0, start=[1, 1, 1], dt=0.01)[1000:] + sampen_args = {"emb_dim": 2, "lag": 1} + with self.subTest(axis="x"): + sx = measures.sampen(data[:, 0], **sampen_args) # pyright: ignore reportCallIssue + self.assertAlmostEqual(0.15, sx, delta=0.05) + with self.subTest(axis="y"): + sy = measures.sampen(data[:, 1], **sampen_args) # pyright: ignore reportCallIssue + self.assertAlmostEqual(0.15, sy, delta=0.05) + with self.subTest(axis="z"): + sz = measures.sampen(data[:, 2], **sampen_args) # pyright: ignore reportCallIssue + self.assertAlmostEqual(0.25, sz, delta=0.05) class RegressionTests(unittest.TestCase): - """Regression tests for main algorithms. - - These tests are here to safeguard against accidental algorithmic changes such - as updates to core dependencies such as numpy or the Python standard library. 
- """ - - def test_sampen(self): - """Test hypothesis: The exact output of sampen() on random data hasn't changed since the last version.""" - data = datasets.load_qrandom()[:1000] - se = nolds.sampen(data, emb_dim=2, tolerance=None, lag=1, dist=nolds.rowwise_chebyshev, closed=False) - self.assertAlmostEqual(2.1876999522832743, se, places=14) - - def test_corr_dim(self): - """Test hypothesis: The exact output of corr_dim() with `fit=poly` on random data hasn't changed since the last version.""" - data = datasets.load_qrandom()[:1000] - cd = nolds.corr_dim(data, emb_dim=5, lag=1, rvals=None, dist=nolds.rowwise_euclidean, fit="poly") - self.assertAlmostEqual(1.303252839255068, cd, places=14) - - @unittest.skipUnless(SCIPY_AVAILABLE, "Tests with RANSAC require scipy.") - def test_corr_dim_RANSAC(self): - """Test hypothesis: The exact output of corr_dim() with `fit=RANSAC` on random data hasn't changed since the last version.""" - data = datasets.load_qrandom()[:1000] - sd = np.std(data, ddof=1) - # fix seed - np.random.seed(42) - # usa a too wide range for rvals to give RANSAC something to do ;) - rvals = nolds.logarithmic_r(0.01 * sd, 2 * sd, 1.03) - cd = nolds.corr_dim(data, emb_dim=5, lag=1, rvals=rvals, dist=nolds.rowwise_euclidean, fit="RANSAC") - self.assertAlmostEqual(0.44745494643404665, cd, places=14) - - def test_lyap_e(self): - """Test hypothesis: The exact output of lyap_e() on random data hasn't changed since the last version.""" - data = datasets.load_qrandom()[:1000] - le = nolds.lyap_e(data, emb_dim=10, matrix_dim=4, min_nb=10, min_tsep=1, tau=1) - expected = np.array([ 0.03779942603329712, -0.014314012551504982, -0.08436867977030214, -0.22316730257003717]) - for i in range(le.shape[0]): - self.assertAlmostEqual(expected[i], le[i], places=14, msg=f"{i+1}th Lyapunov exponent doesn't match") - - def test_lyap_r(self): - """Test hypothesis: The exact output of lyap_r() with `fit=poly` on random data hasn't changed since the last version.""" - data = 
datasets.load_qrandom()[:1000] - le = nolds.lyap_r(data, emb_dim=10, lag=1, min_tsep=1, tau=1, min_neighbors=10, trajectory_len=10, fit="poly") - expected = 0.094715945307378 - self.assertAlmostEqual(expected, le, places=14) - - @unittest.skipUnless(SCIPY_AVAILABLE, "Tests with RANSAC require scipy.") - def test_lyap_r_RANSAC(self): - """Test hypothesis: The exact output of lyap_r() with `fit=RANSAC` on random data hasn't changed since the last version.""" - data = datasets.load_qrandom()[:1000] - np.random.seed(42) - # set lag to 2 for weird duplicate lines - # set trajectory_len to 100 to get many datapoints for RANSAC to choose from - le = nolds.lyap_r(data, emb_dim=10, lag=2, min_tsep=1, tau=1, min_neighbors=10, trajectory_len=100, fit="RANSAC") - expected = 0.0003401212353253564 - self.assertAlmostEqual(expected, le, places=14) - - def test_hurst_rs(self): - """Test hypothesis: The exact output of hurst_rs() with `fit=poly` on random data hasn't changed since the last version.""" - data = datasets.load_qrandom()[:1000] - rs = nolds.hurst_rs(data, nvals=None, fit="poly", corrected=True, unbiased=True) - expected = 0.5123887964986258 - self.assertAlmostEqual(expected, rs, places=14) - - @unittest.skipUnless(SCIPY_AVAILABLE, "Tests with RANSAC require scipy.") - def test_hurst_rs_RANSAC(self): - """Test hypothesis: The exact output of hurst_rs() with `fit=RANSAC` on random data hasn't changed since the last version.""" - data = datasets.load_qrandom()[:1000] - np.random.seed(42) - # increase nsteps in nvals to have more data points for RANSAC to choose from - nvals = nolds.logmid_n(data.shape[0], ratio=1/4.0, nsteps=100) - rs = nolds.hurst_rs(data, nvals=nvals, fit="RANSAC", corrected=True, unbiased=True) - expected = 0.4805431939943321 - self.assertAlmostEqual(expected, rs, places=14) - - def test_dfa(self): - """Test hypothesis: The exact output of dfa() with `fit_exp=poly` on random data hasn't changed since the last version.""" - data = 
datasets.load_qrandom()[:1000] - h = nolds.dfa(data, nvals=None, overlap=True, order=1, fit_trend="poly", fit_exp="poly") - expected = 0.5450874638765073 - self.assertAlmostEqual(expected, h, places=14) - - @unittest.skipUnless(SCIPY_AVAILABLE, "Tests with RANSAC require scipy.") - def test_dfa_RANSAC(self): - """Test hypothesis: The exact output of dfa() with `fit_exp=RANSAC` on random data hasn't changed since the last version.""" - # adds trend to data to introduce a less clear line for fitting - data = datasets.load_qrandom()[:1000] + np.arange(1000) * 100 - np.random.seed(42) - # adds more steps and higher values to nvals to introduce some scattering for RANSAC to have an effect on - nvals = nolds.logarithmic_n(10, 0.9 * data.shape[0], 1.1) - h = nolds.dfa(data, nvals=nvals, overlap=True, order=1, fit_trend="poly", fit_exp="RANSAC") - expected = 1.1372303125405405 - self.assertAlmostEqual(expected, h, places=14) - - def test_mfhurst_b(self): - """Test hypothesis: The exact output of mfhurst_b() with `fit=poly` on random data hasn't changed since the last version.""" - data = datasets.load_qrandom()[:1000] - h = nolds.mfhurst_b(data, qvals=[1], dists=None, fit="poly") - expected = [-0.00559398934417339] - self.assertAlmostEqual(expected[0], h[0], places=14) - - @unittest.skipUnless(SCIPY_AVAILABLE, "Tests with RANSAC require scipy.") - def test_mfhurst_b_RANSAC(self): - """Test hypothesis: The exact output of mfhurst_b() with `fit=RANSAC` on random data hasn't changed since the last version.""" - data = datasets.load_qrandom()[:1000] - np.random.seed(42) - h = nolds.mfhurst_b(data, qvals=[1], dists=None, fit="RANSAC") - expected = [-0.009056463064211057] - self.assertAlmostEqual(expected[0], h[0], places=14) - - def test_mfhurst_dm(self): - """Test hypothesis: The exact output of mfhurst_dm() with `fit=poly` on random data hasn't changed since the last version.""" - data = datasets.load_qrandom()[:1000] - h, _ = nolds.mfhurst_dm(data, qvals=[1], 
max_dists=range(5, 20), detrend=True, fit="poly") - expected = [0.008762803881203145] - self.assertAlmostEqual(expected[0], h[0], places=14) - - @unittest.skipUnless(SCIPY_AVAILABLE, "Tests with RANSAC require scipy.") - def test_mfhurst_dm_RANSAC(self): - """Test hypothesis: The exact output of mfhurst_dm() with `fit=RANSAC` on random data hasn't changed since the last version.""" - data = datasets.load_qrandom()[:1000] - np.random.seed(42) - h, _ = nolds.mfhurst_dm(data, qvals=[1], max_dists=range(5, 20), detrend=True, fit="RANSAC") - expected = [0.005324834328837356] - self.assertAlmostEqual(expected[0], h[0], places=14) + """Regression tests for main algorithms. + + These tests are here to safeguard against accidental algorithmic changes such + as updates to core dependencies such as numpy or the Python standard library. + """ + + @classmethod + def setUpClass(cls) -> None: + """Loads random data for tests.""" + cls.random_data = datasets.load_qrandom()[:1000] + + def test_sampen(self) -> None: + """Hypothesis: The exact output of sampen remains unchanged. + + The test uses random data as input and compares outputs to the previous version. + """ + se = measures.sampen( + self.random_data, + emb_dim=2, + tolerance=None, + lag=1, + dist=measures.rowwise_chebyshev, + closed=False, + ) + self.assertAlmostEqual(2.1876999522832743, se, places=14) + + def test_corr_dim(self) -> None: + """Hypothesis: The exact output of corr_dim with `fit=poly` remains unchanged. + + The test uses random data as input and compares outputs to the previous version. + """ + cd = measures.corr_dim( + self.random_data, + emb_dim=5, + lag=1, + rvals=None, + dist=measures.rowwise_euclidean, + fit="poly", + ) + self.assertAlmostEqual(0.0810185360746645, cd, places=14) + + @unittest.skipUnless(SCIPY_AVAILABLE, "Tests with RANSAC require scipy.") + def test_corr_dim_RANSAC(self) -> None: # noqa: N802 + """Hypothesis: The exact output of corr_dim with `fit=RANSAC` remains unchanged. 
+ + The test uses random data as input and compares outputs to the previous version. + """ + sd = float(np.std(self.random_data, ddof=1)) + # use too wide a range for rvals to give RANSAC something to do ;) + rvals = measures.logarithmic_r(0.01 * sd, 2 * sd, 1.03) + cd = measures.corr_dim( + self.random_data, + emb_dim=5, + lag=1, + rvals=rvals, + dist=measures.rowwise_euclidean, + fit="RANSAC", + random_state=42, + ) + self.assertAlmostEqual(0.0008971209283844629, cd, places=14) + + def test_lyap_e(self) -> None: + """Hypothesis: The exact output of lyap_e remains unchanged. + + The test uses random data as input and compares outputs to the previous version. + """ + le = measures.lyap_e( + self.random_data, emb_dim=10, matrix_dim=4, min_nb=10, min_tsep=1, tau=1 + ) + expected = np.array( + [ + 0.03779942603329712, + -0.014314012551504982, + -0.08436867977030214, + -0.22316730257003717, + ] + ) + assert_almost_equal(le, expected, decimal=14) + + def test_lyap_r(self) -> None: + """Hypothesis: The exact output of lyap_r with `fit=poly` remains unchanged. + + The test uses random data as input and compares outputs to the previous version. + """ + le = measures.lyap_r( + self.random_data, + emb_dim=10, + lag=1, + min_tsep=1, + tau=1, + min_neighbors=10, + trajectory_len=10, + fit="poly", + ) + expected = 0.094715945307378 + self.assertAlmostEqual(expected, le, places=14) + + @unittest.skipUnless(SCIPY_AVAILABLE, "Tests with RANSAC require scipy.") + def test_lyap_r_RANSAC(self) -> None: # noqa: N802 + """Hypothesis: The exact output of lyap_r with `fit=RANSAC` remains unchanged. + + The test uses random data as input and compares outputs to the previous version.
+ """ + # set lag to 2 for weird duplicate lines + # set trajectory_len to 100 to get many datapoints for RANSAC to choose from + le = measures.lyap_r( + self.random_data, + emb_dim=10, + lag=2, + min_tsep=1, + tau=1, + min_neighbors=10, + trajectory_len=100, + fit="RANSAC", + random_state=42, + ) + expected = 0.0003401212353253564 + self.assertAlmostEqual(expected, le, places=14) + + def test_hurst_rs(self) -> None: + """Hypothesis: The exact output of hurst_rs with `fit=poly` remains unchanged. + + The test uses random data as input and compares outputs to the previous version. + """ + rs = measures.hurst_rs( + self.random_data, nvals=None, fit="poly", corrected=True, unbiased=True + ) + expected = 0.5123887964986258 + self.assertAlmostEqual(expected, rs, places=14) + + @unittest.skipUnless(SCIPY_AVAILABLE, "Tests with RANSAC require scipy.") + def test_hurst_rs_RANSAC(self) -> None: # noqa: N802 + """Hypothesis: The exact output of hurst_rs with `fit=RANSAC` remains unchanged. + + The test uses random data as input and compares outputs to the previous version. + """ + # increase nsteps in nvals to have more data points for RANSAC to choose from + nvals = measures.logmid_n(self.random_data.shape[0], ratio=1 / 4.0, nsteps=100) + rs = measures.hurst_rs( + self.random_data, + nvals=nvals, + fit="RANSAC", + corrected=True, + unbiased=True, + random_state=42, + ) + expected = 0.4805431939943321 + self.assertAlmostEqual(expected, rs, places=14) + + def test_dfa(self) -> None: + """Hypothesis: The exact output of dfa with `fit_exp=poly` remains unchanged. + + The test uses random data as input and compares outputs to the previous version. 
+ """ + h = measures.dfa( + self.random_data, + nvals=None, + overlap=True, + order=1, + fit_trend="poly", + fit_exp="poly", + ) + expected = 0.5450874638765073 + self.assertAlmostEqual(expected, h, places=14) + + @unittest.skipUnless(SCIPY_AVAILABLE, "Tests with RANSAC require scipy.") + def test_dfa_RANSAC(self) -> None: # noqa: N802 + """Hypothesis: The exact output of dfa with `fit_exp=RANSAC` remains unchanged. + + The test uses random data as input and compares outputs to the previous version. + """ + # adds trend to data to introduce a less clear line for fitting + random_data = self.random_data + np.arange(1000) * 100 + # adds more steps and higher values to nvals to introduce some scattering + # for RANSAC to have an effect on + nvals = measures.logarithmic_n(10, 0.9 * random_data.shape[0], 1.1) + h = measures.dfa( + random_data, + nvals=nvals, + overlap=True, + order=1, + fit_trend="poly", + fit_exp="RANSAC", + random_state=42, + ) + expected = 1.1372303125405405 + self.assertAlmostEqual(expected, h, places=14) + + def test_mfhurst_b(self) -> None: + """Hypothesis: The exact output of mfhurst_b with `fit=poly` remains unchanged. + + The test uses random data as input and compares outputs to the previous version. + """ + h = measures.mfhurst_b(self.random_data, qvals=[1], dists=None, fit="poly") + expected = [-0.00559398934417339] + self.assertAlmostEqual(expected[0], h[0], places=14) + + @unittest.skipUnless(SCIPY_AVAILABLE, "Tests with RANSAC require scipy.") + def test_mfhurst_b_RANSAC(self) -> None: # noqa: N802 + """Hypothesis: The exact output of mfhurst_b with `fit=RANSAC` remains unchanged. + + The test uses random data as input and compares outputs to the previous version. 
+ """ + h = measures.mfhurst_b( + self.random_data, qvals=[1], dists=None, fit="RANSAC", random_state=42 + ) + expected = [-0.009056463064211057] + self.assertAlmostEqual(expected[0], h[0], places=14) + + def test_mfhurst_dm(self) -> None: + """Hypothesis: The exact output of mfhurst_dm with `fit=poly` remains unchanged. + + The test uses random data as input and compares outputs to the previous version. + """ + h, _ = measures.mfhurst_dm( + self.random_data, + qvals=[1], + max_dists=range(5, 20), + detrend=True, + fit="poly", + ) + expected = [0.008762803881203145] + self.assertAlmostEqual(expected[0], h[0], places=14) + + @unittest.skipUnless(SCIPY_AVAILABLE, "Tests with RANSAC require scipy.") + def test_mfhurst_dm_RANSAC(self) -> None: # noqa: N802 + """Hypothesis: The exact output of mfhurst_dm with `fit=RANSAC` remains unchanged. + + The test uses random data as input and compares outputs to the previous version. + """ + h, _ = measures.mfhurst_dm( + self.random_data, + qvals=[1], + max_dists=range(5, 20), + detrend=True, + fit="RANSAC", + random_state=42, + ) + expected = [0.0068840609945006685] + self.assertAlmostEqual(expected[0], h[0], places=14) class PreviousDefectTests(unittest.TestCase): - """Tests that ensure that a previous bug doesn't come back at some point.""" + """Tests that ensure that a previous bug doesn't come back at some point.""" + + def test_lyap_r_complex_min_tsep(self) -> None: + """Hypothesis: The `min_tsep` parameter can be calculated without creating complex numbers. + + Previously, this would lead to an exception in the code. See + https://github.com/CSchoel/nolds/issues/53 for reference. 
+ """ + data = np.cos(np.arange(100) * 0.01) + # previously this would fail with the following exception: + # TypeError: ufunc 'ceil' not supported for the input types, and the + # inputs could not be safely coerced to any supported types according to + # the casting rule ''safe'' + measures.lyap_r(data) - def test_lyap_r_complex_min_tsep(self): - """Test hypothesis: The `min_tsep` parameter can be calculated without creating complex numbers. - - Previously, this would lead to an exception in the code. See - https://github.com/CSchoel/nolds/issues/53 for reference. - """ - data = np.cos(np.arange(100)*0.01) - # previously this would fail with the following exception: - # TypeError: ufunc 'ceil' not supported for the input types, and the - # inputs could not be safely coerced to any supported types according to - # the casting rule ''safe'' - nolds.lyap_r(data) if __name__ == "__main__": - unittest.main() + unittest.main() diff --git a/pyproject.toml b/pyproject.toml index 314114a..a347800 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ name = "nolds" version = "0.6.2" description = "Nonlinear measures for dynamical systems (based on one-dimensional time series)" authors = [ - {name= "Christopher Schölzel", email= "christopher.schoelzel@mailbox.org"} + { name = "Christopher Schölzel", email = "christopher.schoelzel@mailbox.org" }, ] license = "MIT" license-files = ["LICENSE.txt"] @@ -18,7 +18,7 @@ keywords = [ "DFA", "detrended fluctuation analysis", "sample entropy", - "correlation dimension" + "correlation dimension", ] classifiers = [ "Development Status :: 5 - Production/Stable", @@ -27,19 +27,15 @@ classifiers = [ "License :: OSI Approved :: MIT License", "Topic :: Scientific/Engineering :: Bio-Informatics", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", - "Programming Language :: 
Python :: 3.12" + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", ] readme = "README.rst" -requires-python = ">=3.8" -dependencies = [ - "numpy>1.0,<3.0", - "future>=1.0", - "setuptools>=72.1.0" -] +# NOTE: This also informs ruff which Python version to target (no need for explicit target-version) +requires-python = ">=3.10" +dependencies = ["numpy>1.0,<3.0", "future>=1.0", "setuptools>=72.1.0"] [project.optional-dependencies] RANSAC = ["scikit-learn>=0.19"] @@ -64,11 +60,8 @@ build-backend = "hatchling.build" [tool.ruff] -line-length = 100 # allow slightly longer lines -indent-width = 2 # use two spaces for now to not upset existing code - -# Assume Python 3.8 -target-version = "py38" +line-length = 100 # allow slightly longer lines +indent-width = 4 [tool.ruff.format] # Like Black, use double quotes for strings. @@ -81,16 +74,25 @@ indent-style = "space" # Select all rules by default select = ["ALL"] ignore = [ - "TD002", # adding an author to TODOs wastes space and is redundant because of git history - "TD003", # once there is an issue, there is no need to keep the TODO => don't require links + "TD002", # adding an author to TODOs wastes space and is redundant because of git history + "TD003", # once there is an issue, there is no need to keep the TODO => don't require links + "COM812", # missing trailing commas are already fixed by the formatter + "PLR0913", # this is a scientific library, functions just have a bazillion parameters + "S101", # nolds uses asserts as legitimate checks for programming (not user) errors + "N806", # some variables are kept close to the papers, which means they will have uppercase letters in them + "N803", # same as for variables: We use conventions from scientific papers, which include uppercase letters ] +allowed-confusables = [ + "–", # en-dashes are used for bibliographical references in docstrings +] + [tool.ruff.lint.pydocstyle] convention = "google" [tool.ruff.lint.per-file-ignores] #
source: https://github.com/astral-sh/ruff/issues/4368#issue-1705468153 -"test/**/*.py" = [ +"/**/test_*.py" = [ # at least this three should be fine in tests: "S101", # asserts allowed in tests... "ARG", # Unused function args -> fixtures nevertheless are functionally relevant... @@ -99,6 +101,7 @@ convention = "google" "PLR2004", # Magic value used in comparison, ... "S311", # Standard pseudo-random generators are not suitable for cryptographic purposes "INP001", # Test folders should not have an `__init__.py` + "PT", # We're not using pytest, so we want to use the unittest assert methods ] [[tool.uv.index]]