From 3a8df1b4673eed93a8fcec8c3cd82ede3be53e39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christopher=20Sch=C3=B6lzel?= Date: Wed, 11 Jun 2025 21:45:02 +0200 Subject: [PATCH 01/36] fixes all linting errors reported by ruff that can be fixed automatically --- doc/source/conf.py | 57 +++++---- nolds/__init__.py | 37 +++++- nolds/datasets.py | 67 ++++------- nolds/examples.py | 207 ++++++++++++-------------------- nolds/measures.py | 263 ++++++++++++++++++----------------------- nolds/test_measures.py | 238 ++++++++++++++++--------------------- pyproject.toml | 2 +- 7 files changed, 374 insertions(+), 497 deletions(-) diff --git a/doc/source/conf.py b/doc/source/conf.py index 345747a..3fa3297 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- # # Nolds documentation build configuration file, created by # sphinx-quickstart on Wed Aug 10 17:47:20 2016. @@ -13,14 +12,14 @@ # All configuration values have a default; values that are commented out # serve to show the default. -import sys import os +import sys # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. #sys.path.insert(0, os.path.abspath('.')) -sys.path.insert(0, os.path.abspath('../..')) +sys.path.insert(0, os.path.abspath("../..")) # -- General configuration ------------------------------------------------ @@ -31,47 +30,47 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.coverage', - 'sphinx.ext.mathjax', - 'sphinx.ext.ifconfig', - 'sphinx.ext.viewcode', + "sphinx.ext.autodoc", + "sphinx.ext.coverage", + "sphinx.ext.mathjax", + "sphinx.ext.ifconfig", + "sphinx.ext.viewcode", ] # Add any paths that contain templates here, relative to this directory. 
-templates_path = ['_templates'] +templates_path = ["_templates"] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # source_suffix = ['.rst', '.md'] -source_suffix = '.rst' +source_suffix = ".rst" # The encoding of source files. #source_encoding = 'utf-8-sig' # The master toctree document. -master_doc = 'index' +master_doc = "index" # General information about the project. -project = 'Nolds' -copyright = u'2016-2024, Christopher Schölzel' -author = u'Christopher Schölzel' +project = "Nolds" +copyright = "2016-2024, Christopher Schölzel" +author = "Christopher Schölzel" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. -version = '0.6' +version = "0.6" # The full version, including alpha/beta/rc tags. -release = '0.6.2' +release = "0.6.2" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. -language = 'en' +language = "en" # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: @@ -99,7 +98,7 @@ #show_authors = False # The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' +pygments_style = "sphinx" # A list of ignored prefixes for module index sorting. #modindex_common_prefix = [] @@ -115,13 +114,13 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. -html_theme = 'alabaster' +html_theme = "alabaster" # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. 
html_theme_options = { - "page_width": "1100px" + "page_width": "1100px", } # Add any paths that contain custom themes here, relative to this directory. @@ -209,7 +208,7 @@ #html_search_scorer = 'scorer.js' # Output file base name for HTML help builder. -htmlhelp_basename = 'Noldsdoc' +htmlhelp_basename = "Noldsdoc" # -- Options for LaTeX output --------------------------------------------- @@ -231,8 +230,8 @@ # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - (master_doc, 'Nolds.tex', 'Nolds Documentation', - u'Christopher Schölzel', 'manual'), + (master_doc, "Nolds.tex", "Nolds Documentation", + "Christopher Schölzel", "manual"), ] # The name of an image file (relative to this directory) to place at the top of @@ -261,8 +260,8 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ - (master_doc, 'nolds', 'Nolds Documentation', - [author], 1) + (master_doc, "nolds", "Nolds Documentation", + [author], 1), ] # If true, show URL addresses after external links. @@ -275,9 +274,9 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - (master_doc, 'Nolds', 'Nolds Documentation', - author, 'Nolds', 'One line description of project.', - 'Miscellaneous'), + (master_doc, "Nolds", "Nolds Documentation", + author, "Nolds", "One line description of project.", + "Miscellaneous"), ] # Documents to append as an appendix to all manuals. @@ -292,4 +291,4 @@ # If true, do not generate a @detailmenu in the "Top" node's menu. 
#texinfo_no_detailmenu = False -# autodoc_mock_imports = ['numpy', 'future', 'setuptools', 'builtins'] \ No newline at end of file +# autodoc_mock_imports = ['numpy', 'future', 'setuptools', 'builtins'] diff --git a/nolds/__init__.py b/nolds/__init__.py index e542d1f..25faddb 100644 --- a/nolds/__init__.py +++ b/nolds/__init__.py @@ -1,6 +1,31 @@ -from .measures import lyap_r, lyap_e, sampen, hurst_rs, corr_dim, dfa, \ - binary_n, logarithmic_n, logarithmic_r, expected_h, logmid_n, expected_rs, \ - lyap_r_len, lyap_e_len, rowwise_chebyshev, rowwise_euclidean, mfhurst_b, \ - mfhurst_dm -from .datasets import brown72, tent_map, logistic_map, fbm, fgn, qrandom, \ - load_qrandom, load_financial, barabasi1991_fractal +from .datasets import ( + barabasi1991_fractal, + brown72, + fbm, + fgn, + load_financial, + load_qrandom, + logistic_map, + qrandom, + tent_map, +) +from .measures import ( + binary_n, + corr_dim, + dfa, + expected_h, + expected_rs, + hurst_rs, + logarithmic_n, + logarithmic_r, + logmid_n, + lyap_e, + lyap_e_len, + lyap_r, + lyap_r_len, + mfhurst_b, + mfhurst_dm, + rowwise_chebyshev, + rowwise_euclidean, + sampen, +) diff --git a/nolds/datasets.py b/nolds/datasets.py index 65489a9..7195d60 100644 --- a/nolds/datasets.py +++ b/nolds/datasets.py @@ -1,18 +1,11 @@ -# -*- coding: utf-8 -*- -from __future__ import (absolute_import, division, - print_function, unicode_literals) -from builtins import ( - bytes, dict, int, list, object, range, str, ascii, chr, hex, input, next, - oct, open, pow, round, super, filter, map, zip -) +import datetime + import numpy as np import pkg_resources -import datetime -def lorenz_euler(length, sigma, rho, beta, dt=0.01, start=[1,1,1]): - """ - Simulates the Lorenz system using a simple Euler method +def lorenz_euler(length, sigma, rho, beta, dt=0.01, start=None): + """Simulates the Lorenz system using a simple Euler method. 
The Lorenz system is a three dimensional dynamical system given by the following equations: @@ -21,6 +14,8 @@ def lorenz_euler(length, sigma, rho, beta, dt=0.01, start=[1,1,1]): dy/dt = rho * x - y - x * z dz/dt = x * y - beta * z """ + if start is None: + start = [1, 1, 1] def lorenz(state, sigma, rho, beta): x, y, z = state # NOTE: Numpy 1.x stores intermediate results as float64 @@ -30,7 +25,7 @@ def lorenz(state, sigma, rho, beta): return np.array([ np.float32(sigma) * (y - x), np.float32(rho) * x - y - x * z, - x * y - np.float32(beta) * z + x * y - np.float32(beta) * z, ], dtype="float32") trajectory = np.zeros((length, 3), dtype="float32") trajectory[0] = start @@ -40,8 +35,7 @@ def lorenz(state, sigma, rho, beta): return trajectory def lorenz_lyap(sigma, rho, beta): - """ - Calculates the exact Lyapunov dimension of the Lorenz system according to + """Calculates the exact Lyapunov dimension of the Lorenz system according to Leonov 2015 [ll_1]_. References: @@ -53,8 +47,7 @@ def lorenz_lyap(sigma, rho, beta): def fbm(n, H=0.75): - """ - Generates fractional brownian motions of desired length. + """Generates fractional brownian motions of desired length. Author: Christian Thomae @@ -74,7 +67,8 @@ def fbm(n, H=0.75): simulated fractional brownian motion """ # TODO more detailed description of fbm - assert H > 0 and H < 1 + assert H > 0 + assert H < 1 def R(t, s): twoH = 2 * H @@ -89,8 +83,7 @@ def R(t, s): def fgn(n, H=0.75): - """ - Generates fractional gaussian noise of desired length. + """Generates fractional gaussian noise of desired length. References: .. [fgn_1] https://en.wikipedia.org/wiki/Fractional_Brownian_motion @@ -111,9 +104,8 @@ def fgn(n, H=0.75): def qrandom(n): - """ - Creates an array of n true random numbers obtained from the quantum random - number generator at qrng.anu.edu.au + """Creates an array of n true random numbers obtained from the quantum random + number generator at qrng.anu.edu.au. 
This function requires the package quantumrandom and an internet connection. @@ -127,14 +119,13 @@ def qrandom(n): """ import quantumrandom return np.concatenate([ - quantumrandom.get_data(data_type='uint16', array_length=1024) + quantumrandom.get_data(data_type="uint16", array_length=1024) for i in range(int(np.ceil(n/1024.0))) ])[:n] def load_qrandom(): - """ - Loads a set of 10000 random numbers generated by qrandom. + """Loads a set of 10000 random numbers generated by qrandom. This dataset can be used when you want to do some limited tests with "true" random data without an internet connection. @@ -149,8 +140,7 @@ def load_qrandom(): def load_brown72(): - """ - Loads the dataset brown72 with a prescribed Hurst exponent of 0.72 + """Loads the dataset brown72 with a prescribed Hurst exponent of 0.72. Source: http://bearcave.com/misl/misl_tech/wavelets/hurst/index.html @@ -164,8 +154,7 @@ def load_brown72(): def load_lorenz_physionet(): - """ - Loads a dataset containing the X variable of the Lorenz system + """Loads a dataset containing the X variable of the Lorenz system as well as the output of PhysioNet's dfa implementation on that dataset. The input data was created with the following code: @@ -177,7 +166,7 @@ def load_lorenz_physionet(): The ouptut from PhysioNet was created by calling: dfa < lorenz.txt > lorenz_physionet.txt - + Returns: 1d float array: time series of the X variable of the Lorenz system that was used as input @@ -194,8 +183,7 @@ def load_lorenz_physionet(): def tent_map(x, steps, mu=2): - """ - Generates a time series of the tent map. + """Generates a time series of the tent map. Characteristics and Background: The name of the tent map is derived from the fact that the plot of x_i vs @@ -248,8 +236,7 @@ def tent_map(x, steps, mu=2): def logistic_map(x, steps, r=4): - r""" - Generates a time series of the logistic map. + r"""Generates a time series of the logistic map. 
Characteristics and Background: The logistic map is among the simplest examples for a time series that can @@ -340,8 +327,7 @@ def logistic_map(x, steps, r=4): def load_financial(): - """ - Loads the following datasets from CSV files in this package: + """Loads the following datasets from CSV files in this package: - jkse: Jakarta Composite Index, downloaded on 2019-02-12 from https://finance.yahoo.com/quote/%5EJKSE/history?period1=631148400&period2=988668000&interval=1d&filter=history&frequency=1d - n225: Nikkei 225, downloaded on 2019-02-12 from https://finance.yahoo.com/quote/%5EN225/history?period1=631148400&period2=988668000&interval=1d&filter=history&frequency=1d @@ -376,7 +362,7 @@ def load_finance_yahoo_data(f): values.append(v) return np.array(days), np.array(values) - def pad_opening_values(values): + def pad_opening_values(values) -> None: # fill first value from future if required first = 0 while np.isnan(values[first, 0]): @@ -392,7 +378,7 @@ def pad_opening_values(values): data = [] for index in ["^JKSE", "^N225", "^NDX"]: - fname = "datasets/{}.csv".format(index) + fname = f"datasets/{index}.csv" with pkg_resources.resource_stream(__name__, fname) as f: days, values = load_finance_yahoo_data(f) pad_opening_values(values) @@ -401,8 +387,7 @@ def pad_opening_values(values): def barabasi1991_fractal(size, iterations, b1=0.8, b2=0.5): - """ - Generates the simple fractal described in [bf]_. + """Generates the simple fractal described in [bf]_. 
The fractal divides a rectangular segment starting at (x0, y0) with width w and height h along the x axis into four line segments of equal size with the @@ -458,7 +443,7 @@ def b1991(x0, y0, w, h): d, nxtp = b1991(x1, fractal[x1], x2 - x1, fractal[x2-1] - fractal[x1]) fractal[x1:x2] = d next_intervals.extend( - [(np1, np2) for np1, np2 in zip(nxtp[:-1], nxtp[1:])] + [(np1, np2) for np1, np2 in zip(nxtp[:-1], nxtp[1:])], ) intervals = next_intervals return fractal diff --git a/nolds/examples.py b/nolds/examples.py index b13801d..1896269 100644 --- a/nolds/examples.py +++ b/nolds/examples.py @@ -1,18 +1,11 @@ -# -*- coding: utf-8 -*- -from __future__ import (absolute_import, division, - print_function, unicode_literals) -from builtins import ( - bytes, dict, int, list, object, range, str, ascii, chr, hex, input, next, - oct, open, pow, round, super, filter, map, zip -) -from . import measures as nolds -from . import datasets import numpy as np +from . import datasets +from . import measures as nolds + -def weron_2002_figure2(n=10000): - """ - Recreates figure 2 of [w]_ comparing the reported values by Weron to the +def weron_2002_figure2(n=10000) -> None: + """Recreates figure 2 of [w]_ comparing the reported values by Weron to the values obtained by the functions in this package. The experiment consists of n iterations where the hurst exponent of randomly @@ -72,7 +65,7 @@ def height_to_h(height): ]) rs50_raw = np.mean([ nolds.hurst_rs( - np.random.normal(size=l), fit="poly", nvals=nvals, corrected=False + np.random.normal(size=l), fit="poly", nvals=nvals, corrected=False, ) for _ in range(n) ]) @@ -87,9 +80,8 @@ def height_to_h(height): plt.show() -def plot_hurst_hist(): - """ - Plots a histogram of values obtained for the hurst exponent of uniformly +def plot_hurst_hist() -> None: + """Plots a histogram of values obtained for the hurst exponent of uniformly distributed white noise. This function requires the package ``matplotlib``. 
@@ -106,9 +98,8 @@ def plot_hurst_hist(): plt.show() -def plot_lyap(maptype="logistic"): - """ - Plots a bifurcation plot of the given map and superimposes the true +def plot_lyap(maptype="logistic") -> None: + """Plots a bifurcation plot of the given map and superimposes the true lyapunov exponent as well as the estimates of the largest lyapunov exponent obtained by ``lyap_r`` and ``lyap_e``. The idea for this plot is taken from [ll]_. @@ -164,7 +155,8 @@ def plot_lyap(maptype="logistic"): lambdas = np.log(param_range, where=param_range > 0) lambdas[np.where(param_range <= 0)] = np.nan else: - raise Error("maptype %s not recognized" % maptype) + msg = f"maptype {maptype} not recognized" + raise Error(msg) kwargs_e = {"emb_dim": 6, "matrix_dim": 2} kwargs_r = {"emb_dim": 6, "lag": 2, "min_tsep": 20, "trajectory_len": 20} @@ -173,7 +165,7 @@ def plot_lyap(maptype="logistic"): bifur_x = np.repeat(param_range, nbifur) bifur = np.reshape(full_data[:, -nbifur:], nbifur * param_range.shape[0]) - plt.title("Lyapunov exponent of the %s map" % maptype) + plt.title(f"Lyapunov exponent of the {maptype} map") plt.plot(param_range, lambdas, "b-", label="true lyap. exponent") elab = "estimation using lyap_e" rlab = "estimation using lyap_r" @@ -183,27 +175,25 @@ def plot_lyap(maptype="logistic"): plt.plot(bifur_x, bifur, "ro", alpha=0.1, label="bifurcation plot") plt.ylim((-2, 2)) plt.xlabel(param_name) - plt.ylabel("lyap. exp / %s(x, %s)" % (maptype, param_name)) + plt.ylabel(f"lyap. exp / {maptype}(x, {param_name})") plt.legend(loc="best") plt.show() -def profiling(): - """ - Runs a profiling test for the function ``lyap_e`` (mainly used for - development) +def profiling() -> None: + """Runs a profiling test for the function ``lyap_e`` (mainly used for + development). This function requires the package ``cProfile``. 
""" import cProfile n = 10000 data = np.cumsum(np.random.random(n) - 0.5) - cProfile.runctx('lyap_e(data)', {'lyap_e': nolds.lyap_e}, {'data': data}) + cProfile.runctx("lyap_e(data)", {"lyap_e": nolds.lyap_e}, {"data": data}) -def hurst_compare_nvals(data, nvals=None): - """ - Creates a plot that compares the results of different choices for nvals +def hurst_compare_nvals(data, nvals=None) -> None: + """Creates a plot that compares the results of different choices for nvals for the function hurst_rs. Args: @@ -240,16 +230,16 @@ def corr(nvals): plt.xlabel("log(n)") plt.ylabel("log((R/S)_n - E[(R/S)_n])") plt.legend( - l_all + l_def + l_div + l_cst, ["all", "default", "divisors"] + t_cst + l_all + l_def + l_div + l_cst, ["all", "default", "divisors", *t_cst], ) labeled_data = zip([dd_all[0], dd_def[0], dd_div[0]], ["all", "def", "div"]) - for data, label in labeled_data: - print("%s: %.3f" % (label, data)) + for data, _label in labeled_data: + pass if nvals is not None: - print("custom: %.3f" % dd_cst[0]) + pass plt.show() -def sampen_default_tolerance(): +def sampen_default_tolerance() -> None: data = list(datasets.logistic_map(0.34, 1000, r=3.9)) oldtol = 0.2 * np.std(data, ddof=1) old_res = [ @@ -260,14 +250,11 @@ def sampen_default_tolerance(): nolds.sampen(data, emb_dim=i) for i in range(1, 30) ] - for i, old, new in zip(range(1, 30), old_res, new_res): - print("emb_dim={} old={:.3f} corrected={:.3f}".format(i, old, new)) - print(" old variance: {:.3f}".format(np.var(old_res))) - print("corrected variance: {:.3f}".format(np.var(new_res))) + for _i, _old, _new in zip(range(1, 30), old_res, new_res): + pass -def aste_line_fitting(N=100): - """ - Shows plot that proves that the line fitting in T. Astes original MATLAB code +def aste_line_fitting(N=100) -> None: + """Shows plot that proves that the line fitting in T. Astes original MATLAB code provides the same results as `np.polyfit`. 
""" slope = np.random.random() * 10 - 5 @@ -278,25 +265,24 @@ def aste_line_fitting(N=100): plt.plot(xvals, yvals, "rx", label="data") plt.plot( [0, N-1], [intercept, intercept + slope * (N-1)], - "r-", label="true ({:.3f} x + {:.3f})".format(slope, intercept), alpha=0.5 + "r-", label=f"true ({slope:.3f} x + {intercept:.3f})", alpha=0.5, ) i_aste, s_aste = nolds._aste_line_fit(xvals, yvals) s_np, i_np = np.polyfit(xvals, yvals, 1) plt.plot( [0, N-1], [i_aste, i_aste + s_aste * (N-1)], - "b-", label="aste ({:.3f} x + {:.3f})".format(s_aste, i_aste), alpha=0.5 + "b-", label=f"aste ({s_aste:.3f} x + {i_aste:.3f})", alpha=0.5, ) plt.plot( [0, N-1], [i_np, i_np + s_np * (N-1)], - "g-", label="numpy ({:.3f} x + {:.3f})".format(s_np, i_np), alpha=0.5 + "g-", label=f"numpy ({s_np:.3f} x + {i_np:.3f})", alpha=0.5, ) plt.legend() plt.show() -def hurst_mf_stock(debug=False): - """ - Recreates results from [mfs_1]_ (table at start of section 4) as print +def hurst_mf_stock(debug=False) -> None: + """Recreates results from [mfs_1]_ (table at start of section 4) as print output. Unfortunately as a layman in finance, I could not determine the exact data @@ -331,27 +317,24 @@ def hurst_mf_stock(debug=False): if `True`, a debug plot will be shown for each calculated GHE value except for the ones generated by `_genhurst`. 
""" - print("Dataset mfhurst_b mfhurst_b + dt mfhurst_dm _genhurst") financial = [ - (datasets.jkse, "jkse"), (datasets.n225, "n225"), (datasets.ndx, "ndx") + (datasets.jkse, "jkse"), (datasets.n225, "n225"), (datasets.ndx, "ndx"), ] - for data, lab in financial: + for data, _lab in financial: data = data[1][:, 0] data = np.log(data) dists = range(1, 20) - mfh_b = nolds.mfhurst_b(data, qvals=[2], dists=dists, debug_plot=debug)[0] - mfh_b_dt = nolds.mfhurst_b( + nolds.mfhurst_b(data, qvals=[2], dists=dists, debug_plot=debug)[0] + nolds.mfhurst_b( nolds.detrend_data(data, order=1), - qvals=[2], dists=dists, debug_plot=debug + qvals=[2], dists=dists, debug_plot=debug, )[0] - mfh_dm = nolds.mfhurst_dm(data, qvals=[2], debug_plot=debug)[0][0] - gh = nolds._genhurst(data, 2) - print("{:10s} {:5.3f} {:5.3f} {:5.3f} {:5.3f}".format(lab, mfh_b, mfh_b_dt, mfh_dm, gh)) + nolds.mfhurst_dm(data, qvals=[2], debug_plot=debug)[0][0] + nolds._genhurst(data, 2) -def barabasi_1991_figure2(): - """ - Recreates figure 2 from [bf2]_. +def barabasi_1991_figure2() -> None: + """Recreates figure 2 from [bf2]_. This figure compares calculated and estimated values for H(q) for a fractal generated by 9 iterations of the `barabasi1991_fractal` function @@ -379,9 +362,8 @@ def barabasi_1991_figure2(): plt.show() -def barabasi_1991_figure3(): - """ - Recreates figure 3 from [bf3]_. +def barabasi_1991_figure3() -> None: + """Recreates figure 3 from [bf3]_. This figure compares calculated and estimated values for H(q) for a simple Brownian motion that moves in unit steps (-1 or +1) in each time step. @@ -406,9 +388,8 @@ def barabasi_1991_figure3(): plt.show() -def lorenz(): - """ - Calculates different measures for the Lorenz system of ordinary +def lorenz() -> None: + """Calculates different measures for the Lorenz system of ordinary differential equations and compares nolds results with prescribed results from the literature. 
@@ -464,7 +445,6 @@ def lorenz(): """ - import matplotlib.pyplot as plt sigma = 10 rho = 28 beta = 8.0/3 @@ -480,33 +460,25 @@ def lorenz(): # plt.show() # plt.close(fig) - lyap_expected = datasets.lorenz_lyap(sigma, rho, beta) + datasets.lorenz_lyap(sigma, rho, beta) # Rationale for argument values: # start with medium settings for min_tsep and lag, span a large area with trajectory_len, set fit_offset to 0 # up the embedding dimension until you get a clear line in the debug plot # adjust trajectory_len and fit_offset to split off only the linear part # in general: the longer the linear part of the plot, the better - lyap_r_args = dict(min_tsep=10, emb_dim=5, tau=dt, lag=5, trajectory_len=28, fit_offset=8, fit="poly") - lyap_rx = nolds.lyap_r(data[:, 0], **lyap_r_args) - lyap_ry = nolds.lyap_r(data[:, 1], **lyap_r_args) - lyap_rz = nolds.lyap_r(data[:, 2], **lyap_r_args) + lyap_r_args = {"min_tsep": 10, "emb_dim": 5, "tau": dt, "lag": 5, "trajectory_len": 28, "fit_offset": 8, "fit": "poly"} + nolds.lyap_r(data[:, 0], **lyap_r_args) + nolds.lyap_r(data[:, 1], **lyap_r_args) + nolds.lyap_r(data[:, 2], **lyap_r_args) # Rationale for argument values: # Start with emb_dim=matrix_dim, medium min_tsep and min_nb # After that, no good guidelines for stability. :( # -> Just experiment with settings until you get close to expected value. ¯\_(ツ)_/¯ # NOTE: It seems from this example and `lyapunov-logistic` that lyap_e has a scaling problem. 
- lyap_e_args = dict(min_tsep=10, emb_dim=5, matrix_dim=5, tau=dt, min_nb=8) - lyap_ex = nolds.lyap_e(data[:, 0], **lyap_e_args) - lyap_ey = nolds.lyap_e(data[:, 1], **lyap_e_args) - lyap_ez = nolds.lyap_e(data[:, 2], **lyap_e_args) - print("Expected Lyapunov exponent: ", lyap_expected) - print("lyap_r(x) : ", lyap_rx) - print("lyap_r(y) : ", lyap_ry) - print("lyap_r(z) : ", lyap_rz) - print("lyap_e(x) : ", lyap_ex) - print("lyap_e(y) : ", lyap_ey) - print("lyap_e(z) : ", lyap_ez) - print() + lyap_e_args = {"min_tsep": 10, "emb_dim": 5, "matrix_dim": 5, "tau": dt, "min_nb": 8} + nolds.lyap_e(data[:, 0], **lyap_e_args) + nolds.lyap_e(data[:, 1], **lyap_e_args) + nolds.lyap_e(data[:, 2], **lyap_e_args) # Rationale for argument values: # Start with moderate settings for lag and a large span of rvals. @@ -515,31 +487,21 @@ def lorenz(): # Increase lag as long as it increases the output. Stop when the output becomes smaller # (or when you feel that the lag is unreasonably large.) rvals = nolds.logarithmic_r(1, np.e, 1.1) # determined experimentally - corr_dim_args = dict(emb_dim=5, lag=10, fit="poly", rvals=rvals) - cdx = nolds.corr_dim(data[:, 0], **corr_dim_args) - cdy = nolds.corr_dim(data[:, 1], **corr_dim_args) - cdz = nolds.corr_dim(data[:, 2], **corr_dim_args) + corr_dim_args = {"emb_dim": 5, "lag": 10, "fit": "poly", "rvals": rvals} + nolds.corr_dim(data[:, 0], **corr_dim_args) + nolds.corr_dim(data[:, 1], **corr_dim_args) + nolds.corr_dim(data[:, 2], **corr_dim_args) # reference Grassberger-Procaccia 1983 - print("Expected correlation dimension: 2.05") - print("corr_dim(x) : ", cdx) - print("corr_dim(y) : ", cdy) - print("corr_dim(z) : ", cdz) - print() # Rationale for argument values: # Start with a large range of nvals. # Reduce those down cutting of the first few data points and then only keep the # linear-ish looking part of the initial rise. 
- hurst_rs_args = dict(fit="poly", nvals=nolds.logarithmic_n(10, 70, 1.1)) - hx = nolds.hurst_rs(data[:, 0], **hurst_rs_args) - hy = nolds.hurst_rs(data[:, 1], **hurst_rs_args) - hz = nolds.hurst_rs(data[:, 2], **hurst_rs_args) + hurst_rs_args = {"fit": "poly", "nvals": nolds.logarithmic_n(10, 70, 1.1)} + nolds.hurst_rs(data[:, 0], **hurst_rs_args) + nolds.hurst_rs(data[:, 1], **hurst_rs_args) + nolds.hurst_rs(data[:, 2], **hurst_rs_args) # reference: Suyal 2009 - print("Expected hurst exponent: 0.64 < H < 0.93") - print("hurst_rs(x) : ", hx) - print("hurst_rs(y) : ", hy) - print("hurst_rs(z) : ", hz) - print() # reference: Wallot 2023, Table 1 # Rationale for argument values: Just follow paper @@ -548,26 +510,17 @@ def lorenz(): # don't report step size, we use different data here data_dfa = datasets.lorenz_euler(120000, 10, 28, 8/3.0, start=[0.1,0.1,0.1], dt=0.002)[20000:] nvals = nolds.logarithmic_n(200, len(data_dfa)/8, 2**0.2) - dfa_args = dict(nvals=nvals, order=2, overlap=False, fit_exp="poly") - dx = nolds.dfa(data_dfa[:, 0], **dfa_args) - dy = nolds.dfa(data_dfa[:, 1], **dfa_args) - dz = nolds.dfa(data_dfa[:, 2], **dfa_args) - print("Expected hurst parameter: [1.008 ±0.016, 0.926 ±0.016, 0.650 ±0.22]") - print("dfa(x) : ", dx) - print("dfa(y) : ", dy) - print("dfa(z) : ", dz) - print() + dfa_args = {"nvals": nvals, "order": 2, "overlap": False, "fit_exp": "poly"} + nolds.dfa(data_dfa[:, 0], **dfa_args) + nolds.dfa(data_dfa[:, 1], **dfa_args) + nolds.dfa(data_dfa[:, 2], **dfa_args) # reference: Kaffashi 2008 # Rationale for argument values: Just follow paper. 
- sampen_args = dict(emb_dim=2, lag=1) - sx = nolds.sampen(data[:, 0], **sampen_args) - sy = nolds.sampen(data[:, 1], **sampen_args) - sz = nolds.sampen(data[:, 2], **sampen_args) - print("Expected sample entropy: [0.15, 0.15, 0.25]") - print("sampen(x): ", sx) - print("sampen(y): ", sy) - print("sampen(z): ", sz) + sampen_args = {"emb_dim": 2, "lag": 1} + nolds.sampen(data[:, 0], **sampen_args) + nolds.sampen(data[:, 1], **sampen_args) + nolds.sampen(data[:, 2], **sampen_args) if __name__ == "__main__": @@ -575,20 +528,9 @@ def lorenz(): # python -m nolds.examples lyapunov-logistic import sys - def print_options(): - print("options are:") - print(" lyapunov-logistic") - print(" lyapunov-tent") - print(" profiling") - print(" hurst-weron2") - print(" hurst-hist") - print(" hurst-nvals") - print(" sampen-tol") - print(" aste-line") - print(" hurst-mf-stock") - print(" lorenz") + def print_options() -> None: + pass if len(sys.argv) < 2: - print("please tell me which tests you want to run") print_options() elif sys.argv[1] == "lyapunov-logistic": plot_lyap() @@ -616,5 +558,4 @@ def print_options(): elif sys.argv[1] == "lorenz": lorenz() else: - print("i do not know any test of that name") print_options() diff --git a/nolds/measures.py b/nolds/measures.py index 7fe4d1b..52ce1f7 100644 --- a/nolds/measures.py +++ b/nolds/measures.py @@ -1,13 +1,7 @@ -# -*- coding: utf-8 -*- -from __future__ import (absolute_import, division, - print_function, unicode_literals) -from builtins import ( - bytes, dict, int, list, object, range, str, ascii, chr, hex, input, next, - oct, open, pow, round, super, filter, map, zip -) -import numpy as np -import warnings import math +import warnings + +import numpy as np def rowwise_chebyshev(x, y): @@ -30,13 +24,13 @@ def poly_fit(x, y, degree, fit="RANSAC"): except ImportError: warnings.warn( "fitting mode 'RANSAC' requires the package sklearn, using" - + " 'poly' instead", - RuntimeWarning) + " 'poly' instead", + RuntimeWarning, stacklevel=2) 
fit = "poly" if fit == "poly": return np.polyfit(x, y, degree) - elif fit == "RANSAC": + if fit == "RANSAC": model = sklin.RANSACRegressor(sklin.LinearRegression(fit_intercept=False)) xdat = np.asarray(x) if len(xdat.shape) == 1: @@ -50,17 +44,16 @@ def poly_fit(x, y, degree, fit="RANSAC"): except ValueError: warnings.warn( "RANSAC did not reach consensus, " - + "using numpy's polyfit", - RuntimeWarning) + "using numpy's polyfit", + RuntimeWarning, stacklevel=2) coef = np.polyfit(x, y, degree) return coef - else: - raise ValueError("invalid fitting mode ({})".format(fit)) + msg = f"invalid fitting mode ({fit})" + raise ValueError(msg) def delay_embedding(data, emb_dim, lag=1): - """ - Perform a time-delay embedding of a time series + """Perform a time-delay embedding of a time series. Args: data (array-like): @@ -81,7 +74,7 @@ def delay_embedding(data, emb_dim, lag=1): min_len = (emb_dim - 1) * lag + 1 if len(data) < min_len: msg = "cannot embed data of length {} with embedding dimension {} " \ - + "and lag {}, minimum required length is {}" + "and lag {}, minimum required length is {}" raise ValueError(msg.format(len(data), emb_dim, lag, min_len)) m = len(data) - min_len + 1 indices = np.repeat([np.arange(emb_dim) * lag], m, axis=0) @@ -90,8 +83,7 @@ def delay_embedding(data, emb_dim, lag=1): def lyap_r_len(**kwargs): - """ - Helper function that calculates the minimum number of data points required + """Helper function that calculates the minimum number of data points required to use lyap_r. Note that none of the required parameters may be set to None. 
@@ -106,19 +98,18 @@ def lyap_r_len(**kwargs): parameters """ # minimum length required to find single orbit vector - min_len = (kwargs['emb_dim'] - 1) * kwargs['lag'] + 1 + min_len = (kwargs["emb_dim"] - 1) * kwargs["lag"] + 1 # we need trajectory_len orbit vectors to follow a complete trajectory - min_len += kwargs['trajectory_len'] - 1 + min_len += kwargs["trajectory_len"] - 1 # we need min_tsep * 2 + 1 orbit vectors to find neighbors for each - min_len += kwargs['min_tsep'] * 2 + 1 + min_len += kwargs["min_tsep"] * 2 + 1 return min_len def lyap_r(data, emb_dim=10, lag=None, min_tsep=None, tau=1, min_neighbors=20, trajectory_len=20, fit="RANSAC", debug_plot=False, debug_data=False, plot_file=None, fit_offset=0): - """ - Estimates the largest Lyapunov exponent using the algorithm of Rosenstein + """Estimates the largest Lyapunov exponent using the algorithm of Rosenstein et al. [lr_1]_. Explanation of Lyapunov exponents: @@ -260,7 +251,7 @@ def lyap_r(data, emb_dim=10, lag=None, min_tsep=None, tau=1, min_neighbors=20, if min_tsep > max_tsep_factor * n: min_tsep = int(max_tsep_factor * n) msg = "signal has very low mean frequency, setting min_tsep = {:d}" - warnings.warn(msg.format(min_tsep), RuntimeWarning) + warnings.warn(msg.format(min_tsep), RuntimeWarning, stacklevel=2) if lag is None: # calculate the lag as point where the autocorrelation drops to (1 - 1/e) # times its maximum value @@ -278,7 +269,7 @@ def lyap_r(data, emb_dim=10, lag=None, min_tsep=None, tau=1, min_neighbors=20, def nb_neighbors(lag_value): min_len = lyap_r_len( emb_dim=emb_dim, lag=lag_value, trajectory_len=trajectory_len, - min_tsep=min_tsep + min_tsep=min_tsep, ) return max(0, n - min_len) # find lag @@ -288,19 +279,19 @@ def nb_neighbors(lag_value): break if nb_neighbors(i) < min_neighbors: msg = "autocorrelation declined too slowly to find suitable lag" \ - + ", setting lag to {}" - warnings.warn(msg.format(lag), RuntimeWarning) + ", setting lag to {}" + warnings.warn(msg.format(lag), 
RuntimeWarning, stacklevel=2) break min_len = lyap_r_len( emb_dim=emb_dim, lag=lag, trajectory_len=trajectory_len, - min_tsep=min_tsep + min_tsep=min_tsep, ) if len(data) < min_len: msg = "for emb_dim = {}, lag = {}, min_tsep = {} and trajectory_len = {}" \ - + " you need at least {} datapoints in your time series" + " you need at least {} datapoints in your time series" warnings.warn( msg.format(emb_dim, lag, min_tsep, trajectory_len, min_len), - RuntimeWarning + RuntimeWarning, stacklevel=2, ) # delay embedding orbit = delay_embedding(data, emb_dim, lag) @@ -318,14 +309,14 @@ def nb_neighbors(lag_value): min_traj = min_tsep * 2 + 2 # in each row min_tsep + 1 disances are inf if ntraj <= 0: msg = "Not enough data points. Need {} additional data points to follow " \ - + "a complete trajectory." + "a complete trajectory." raise ValueError(msg.format(-ntraj+1)) if ntraj < min_traj: # not enough data points => there are rows where all values are inf assert np.any(np.all(np.isinf(dists[:ntraj, :ntraj]), axis=1)) msg = "Not enough data points. At least {} trajectories are required " \ - + "to find a valid neighbor for each orbit vector with min_tsep={} " \ - + "but only {} could be created." + "to find a valid neighbor for each orbit vector with min_tsep={} " \ + "but only {} could be created." raise ValueError(msg.format(min_traj, min_tsep, ntraj)) assert np.all(np.any(np.isfinite(dists[:ntraj, :ntraj]), axis=1)) # find nearest neighbors (exclude last columns, because these vectors cannot @@ -365,13 +356,11 @@ def nb_neighbors(lag_value): le = poly[0] / tau if debug_data: return (le, (ks, div_traj, poly)) - else: - return le + return le def lyap_e_len(**kwargs): - """ - Helper function that calculates the minimum number of data points required + """Helper function that calculates the minimum number of data points required to use lyap_e. Note that none of the required parameters may be set to None. 
@@ -385,22 +374,21 @@ def lyap_e_len(**kwargs): minimum number of data points required to call lyap_e with the given parameters """ - m = (kwargs['emb_dim'] - 1) // (kwargs['matrix_dim'] - 1) + m = (kwargs["emb_dim"] - 1) // (kwargs["matrix_dim"] - 1) # minimum length required to find single orbit vector - min_len = kwargs['emb_dim'] + min_len = kwargs["emb_dim"] # we need to follow each starting point of an orbit vector for m more steps min_len += m # we need min_tsep * 2 + 1 orbit vectors to find neighbors for each - min_len += kwargs['min_tsep'] * 2 + min_len += kwargs["min_tsep"] * 2 # we need at least min_nb neighbors for each orbit vector - min_len += kwargs['min_nb'] + min_len += kwargs["min_nb"] return min_len def lyap_e(data, emb_dim=10, matrix_dim=4, min_nb=None, min_tsep=0, tau=1, debug_plot=False, debug_data=False, plot_file=None): - """ - Estimates the Lyapunov exponents for the given data using the algorithm of + r"""Estimates the Lyapunov exponents for the given data using the algorithm of Eckmann et al. [le_1]_. Recommendations for parameter settings by Eckmann et al.: @@ -506,22 +494,23 @@ def lyap_e(data, emb_dim=10, matrix_dim=4, min_nb=None, min_tsep=0, tau=1, data = np.asarray(data, dtype=np.float64) n = len(data) if (emb_dim - 1) % (matrix_dim - 1) != 0: - raise ValueError("emb_dim - 1 must be divisible by matrix_dim - 1!") + msg = "emb_dim - 1 must be divisible by matrix_dim - 1!" + raise ValueError(msg) m = (emb_dim - 1) // (matrix_dim - 1) if min_nb is None: # minimal number of neighbors as suggested by Eckmann et al. min_nb = min(2 * matrix_dim, matrix_dim + 4) min_len = lyap_e_len( - emb_dim=emb_dim, matrix_dim=matrix_dim, min_nb=min_nb, min_tsep=min_tsep + emb_dim=emb_dim, matrix_dim=matrix_dim, min_nb=min_nb, min_tsep=min_tsep, ) if n < min_len: msg = "{} data points are not enough! 
For emb_dim = {}, matrix_dim = {}" \ - + ", min_tsep = {} and min_nb = {} you need at least {} data points " \ - + "in your time series" + ", min_tsep = {} and min_nb = {} you need at least {} data points " \ + "in your time series" warnings.warn( msg.format(n, emb_dim, matrix_dim, min_tsep, min_nb, min_len), - RuntimeWarning + RuntimeWarning, stacklevel=2, ) # construct orbit as matrix (e = emb_dim) @@ -536,7 +525,7 @@ def lyap_e(data, emb_dim=10, matrix_dim=4, min_nb=None, min_tsep=0, tau=1, if len(orbit) < min_nb: assert len(data) < min_len msg = "Not enough data points. Need at least {} additional data points " \ - + "to have min_nb = {} neighbor candidates" + "to have min_nb = {} neighbor candidates" raise ValueError(msg.format(min_nb-len(orbit), min_nb)) old_Q = np.identity(matrix_dim) lexp = np.zeros(matrix_dim, dtype=np.float64) @@ -548,7 +537,7 @@ def lyap_e(data, emb_dim=10, matrix_dim=4, min_nb=None, min_tsep=0, tau=1, # find neighbors for each vector in the orbit using the chebyshev distance diffs = rowwise_chebyshev(orbit, orbit[i]) # ensure that we do not count the difference of the vector to itself - diffs[i] = float('inf') + diffs[i] = float("inf") # mask all neighbors that are too close in time to the vector itself mask_from = max(0, i - min_tsep) mask_to = min(len(diffs), i + min_tsep + 1) @@ -559,8 +548,8 @@ def lyap_e(data, emb_dim=10, matrix_dim=4, min_nb=None, min_tsep=0, tau=1, if np.isinf(r): assert len(data) < min_len msg = "Not enough data points. Orbit vector {} has less than min_nb = " \ - + "{} valid neighbors that are at least min_tsep = {} time steps " \ - + "away. Input must have at least length {}." + "{} valid neighbors that are at least min_tsep = {} time steps " \ + "away. Input must have at least length {}." 
raise ValueError(msg.format(i, min_nb, min_tsep, min_len)) # there may be more than min_nb vectors at distance r (if multiple vectors # have a distance of exactly r) @@ -608,8 +597,8 @@ def lyap_e(data, emb_dim=10, matrix_dim=4, min_nb=None, min_tsep=0, tau=1, if max(np.max(indices), i) + matrix_dim * m >= len(data): assert len(data) < min_len msg = "Not enough data points. Cannot follow orbit vector {} for " \ - + "{} (matrix_dim * m) time steps. Input must have at least " \ - + "length {}." + "{} (matrix_dim * m) time steps. Input must have at least " \ + "length {}." raise ValueError(msg.format(i, matrix_dim * m, min_len)) vec_beta = data[indices + matrix_dim * m] - data[i + matrix_dim * m] @@ -665,7 +654,7 @@ def lyap_e(data, emb_dim=10, matrix_dim=4, min_nb=None, min_tsep=0, tau=1, return lexp -def plot_dists(dists, tolerance, m, title=None, fname=None): +def plot_dists(dists, tolerance, m, title=None, fname=None) -> None: # local import to avoid dependency for non-debug use import matplotlib.pyplot as plt nstd = 3 @@ -679,7 +668,7 @@ def plot_dists(dists, tolerance, m, title=None, fname=None): colors = ["green", "blue"] for h, bins in [np.histogram(dat, nbins, rng) for dat in dists]: bw = bins[1] - bins[0] - plt.bar(bins[:-1], h, bw, label="m={:d}".format(m + i), + plt.bar(bins[:-1], h, bw, label=f"m={m + i:d}", color=colors[i], alpha=0.5) i += 1 plt.axvline(tolerance, color="red") @@ -698,8 +687,7 @@ def plot_dists(dists, tolerance, m, title=None, fname=None): def sampen(data, emb_dim=2, tolerance=None, lag=1, dist=rowwise_chebyshev, closed=False, debug_plot=False, debug_data=False, plot_file=None): - """ - Computes the sample entropy of the given data. + """Computes the sample entropy of the given data. 
Explanation of the sample entropy: The sample entropy of a time series is defined as the negative natural @@ -828,10 +816,10 @@ def sampen(data, emb_dim=2, tolerance=None, lag=1, dist=rowwise_chebyshev, zcounts.append("emb_dim + 1") warnings.warn( ( - "Zero vectors are within tolerance for %s. " \ - + "Consider raising the tolerance parameter to avoid %s result." - ) % (" and ".join(zcounts), "NaN" if len(zcounts) == 2 else "inf"), - RuntimeWarning + "Zero vectors are within tolerance for {}. " \ + "Consider raising the tolerance parameter to avoid {} result." + ).format(" and ".join(zcounts), "NaN" if len(zcounts) == 2 else "inf"), + RuntimeWarning, stacklevel=2, ) if counts[0] == 0 and counts[1] == 0: saen = np.nan @@ -840,17 +828,15 @@ def sampen(data, emb_dim=2, tolerance=None, lag=1, dist=rowwise_chebyshev, else: saen = np.inf if debug_plot: - plot_dists(plot_data, tolerance, m, title="sampEn = {:.3f}".format(saen), + plot_dists(plot_data, tolerance, m, title=f"sampEn = {saen:.3f}", fname=plot_file) if debug_data: return (saen, counts, plot_data) - else: - return saen + return saen def binary_n(total_N, min_n=50): - """ - Creates a list of values by successively halving the total length total_N + """Creates a list of values by successively halving the total length total_N until the resulting value is less than min_n. Non-integer results are rounded down. @@ -872,8 +858,7 @@ def binary_n(total_N, min_n=50): def logarithmic_n(min_n, max_n, factor): - """ - Creates a list of values by successively multiplying a minimum value min_n by + """Creates a list of values by successively multiplying a minimum value min_n by a factor > 1 until a maximum value max_n is reached. Non-integer results are rounded down. 
@@ -906,8 +891,7 @@ def logarithmic_n(min_n, max_n, factor): def logmid_n(max_n, ratio=1/4.0, nsteps=15): - """ - Creates an array of integers that lie evenly spaced in the "middle" of the + """Creates an array of integers that lie evenly spaced in the "middle" of the logarithmic scale from 0 to log(max_n). If max_n is very small and/or nsteps is very large, this may lead to @@ -944,8 +928,7 @@ def logmid_n(max_n, ratio=1/4.0, nsteps=15): def logarithmic_r(min_n, max_n, factor): - """ - Creates a list of values by successively multiplying a minimum value min_n by + """Creates a list of values by successively multiplying a minimum value min_n by a factor > 1 until a maximum value max_n is reached. Args: @@ -967,8 +950,7 @@ def logarithmic_r(min_n, max_n, factor): def expected_rs(n): - """ - Calculates the expected (R/S)_n for white noise for a given n. + """Calculates the expected (R/S)_n for white noise for a given n. This is used as a correction factor in the function hurst_rs. It uses the formula of Anis-Lloyd-Peters (see [h_3]_). @@ -992,8 +974,7 @@ def expected_rs(n): def expected_h(nvals, fit="RANSAC"): - """ - Uses expected_rs to calculate the expected value for the Hurst exponent h + """Uses expected_rs to calculate the expected value for the Hurst exponent h based on the values of n used for the calculation. Args: @@ -1016,8 +997,7 @@ def expected_h(nvals, fit="RANSAC"): def rs(data, n, unbiased=True): - """ - Calculates an individual R/S value in the rescaled range approach for + """Calculates an individual R/S value in the rescaled range approach for a given n. 
Note: This is just a helper function for hurst_rs and should not be called @@ -1065,12 +1045,11 @@ def rs(data, n, unbiased=True): # it may happen that all ranges are zero (if all values in data are equal) if len(r) == 0: return np.nan - else: - # return mean of r/s along subsequence index - return np.mean(r / s) + # return mean of r/s along subsequence index + return np.mean(r / s) -def plot_histogram_matrix(data, name, bin_range="3sigma", fname=None): +def plot_histogram_matrix(data, name, bin_range="3sigma", fname=None) -> None: # local import to avoid dependency for non-debug use import matplotlib.pyplot as plt nhists = len(data[0]) @@ -1094,7 +1073,7 @@ def plot_histogram_matrix(data, name, bin_range="3sigma", fname=None): plt.bar(bins[:-1], h, bin_width) plt.axvline(np.mean(data[:, i]), color="red") plt.ylim(ylim) - plt.title("{:s}[{:d}]".format(name, i)) + plt.title(f"{name:s}[{i:d}]") if fname is None: plt.show() else: @@ -1103,9 +1082,8 @@ def plot_histogram_matrix(data, name, bin_range="3sigma", fname=None): def plot_reg(xvals, yvals, poly, x_label="x", y_label="y", data_label="data", - reg_label="regression line", fname=None): - """ - Helper function to plot trend lines for line-fitting approaches. This + reg_label="regression line", fname=None) -> None: + """Helper function to plot trend lines for line-fitting approaches. This function will show a plot through ``plt.show()`` and close it after the window has been closed by the user. 
@@ -1132,7 +1110,7 @@ def plot_reg(xvals, yvals, poly, x_label="x", y_label="y", data_label="data", # local import to avoid dependency for non-debug use import matplotlib.pyplot as plt plt.plot(xvals, yvals, "bo", label=data_label) - if not (poly is None): + if poly is not None: plt.plot(xvals, np.polyval(poly, xvals), "r-", label=reg_label) plt.xlabel(x_label) plt.ylabel(y_label) @@ -1146,10 +1124,8 @@ def plot_reg(xvals, yvals, poly, x_label="x", y_label="y", data_label="data", def plot_reg_tiled(xvals, yvals, polys, x_label="x", y_label="y", data_labels=None, reg_labels=None, fname=None, - columns=None): - """ - TODO - """ + columns=None) -> None: + """TODO.""" # local import to avoid dependency for non-debug use import matplotlib.pyplot as plt max_span = max([np.max(y) - np.min(y) for y in yvals]) @@ -1163,7 +1139,7 @@ def plot_reg_tiled(xvals, yvals, polys, x_label="x", y_label="y", for i in range(len(xvals)): plt.subplot(int(np.ceil(len(xvals) / columns)), columns, i + 1) plt.plot(xvals[i], yvals[i], "bo", label=data_labels[i]) - if not (polys is None): + if polys is not None: plt.plot(xvals[i], np.polyval(polys[i], xvals[i]), "r-", label=reg_labels[i]) plt.xlabel(x_label) plt.ylabel(y_label) @@ -1177,10 +1153,8 @@ def plot_reg_tiled(xvals, yvals, polys, x_label="x", y_label="y", def plot_reg_multiple(xvals, yvals, polys, x_label="x", y_label="y", - data_labels=None, reg_labels=None, fname=None): - """ - TODO - """ + data_labels=None, reg_labels=None, fname=None) -> None: + """TODO.""" import matplotlib.pyplot as plt if data_labels is None: data_labels = ["data"] * len(xvals) @@ -1188,7 +1162,7 @@ def plot_reg_multiple(xvals, yvals, polys, x_label="x", y_label="y", reg_labels = ["regression line"] * len(xvals) for i in range(len(xvals)): plt.plot(xvals[i], yvals[i], "+", label=data_labels[i]) - if not (polys is None): + if polys is not None: plt.plot(xvals[i], np.polyval(polys[i], xvals[i]), label=reg_labels[i]) plt.xlabel(x_label) plt.ylabel(y_label) @@ 
-1202,8 +1176,7 @@ def plot_reg_multiple(xvals, yvals, polys, x_label="x", y_label="y", def hurst_rs(data, nvals=None, fit="RANSAC", debug_plot=False, debug_data=False, plot_file=None, corrected=True, unbiased=True): - """ - Calculates the Hurst exponent by a standard rescaled range (R/S) approach. + """Calculates the Hurst exponent by a standard rescaled range (R/S) approach. Explanation of Hurst exponent: The Hurst exponent is a measure for the "long-term memory" of a @@ -1373,7 +1346,7 @@ def hurst_rs(data, nvals=None, fit="RANSAC", debug_plot=False, if debug_plot: warnings.warn( "Cannot display debug plot, all (R/S)_n are NaN", - RuntimeWarning + RuntimeWarning, stacklevel=2, ) else: # fit a line to the logarithm of the obtained (R/S)_n @@ -1390,17 +1363,15 @@ def hurst_rs(data, nvals=None, fit="RANSAC", debug_plot=False, # return line slope (+ correction) as hurst exponent if debug_data: return (h, (np.log(nvals), np.log(rsvals), poly)) - else: - return h + return h # TODO implement MFDFA as second (more reliable) measure for multifractality # NOTE: probably not needed, since mfhurst_b is already pretty reliable -def mfhurst_b(data, qvals=None, dists=None, fit='poly', +def mfhurst_b(data, qvals=None, dists=None, fit="poly", debug_plot=False, debug_data=False, plot_file=None): - """ - Calculates the Generalized Hurst Exponent H_q for different q according to + r"""Calculates the Generalized Hurst Exponent H_q for different q according to A.-L. Barabási and T. Vicsek. 
Explanation of the Generalized Hurst Exponent: @@ -1523,7 +1494,7 @@ def mfhurst_b(data, qvals=None, dists=None, fit='poly', dists = np.asarray(dists) if len(data) < 60: warnings.warn( - "H(q) is not reliable for small time series ({} < 60)".format(len(data)) + f"H(q) is not reliable for small time series ({len(data)} < 60)", stacklevel=2, ) def hhcorr(d, q): @@ -1551,18 +1522,16 @@ def hhcorr(d, q): [p / q for p, q in zip(polys, qvals)], x_label="log(x)", y_label="$\\log(c_q(x)) / q$", data_labels=["q = %d" % q for q in qvals], - reg_labels=["reg. line (H = {:.3f})".format(h) for h in H], - fname=plot_file + reg_labels=[f"reg. line (H = {h:.3f})" for h in H], + fname=plot_file, ) if debug_data: return H, (xvals, yvals, polys) - else: - return H + return H def _genhurst(S, q): - """ - Computes the generalized hurst exponent H_q for time series S. + """Computes the generalized hurst exponent H_q for time series S. This function should not be used. It is only kept here to demonstrate that ``mfhurst_dm`` is implemented correctly. You can use the following call to @@ -1607,7 +1576,7 @@ def _genhurst(S, q): """ L = len(S) if L < 100: - warnings.warn('Data series very short!') + warnings.warn("Data series very short!", stacklevel=2) H = np.zeros((len(range(5, 20)), 1)) k = 0 @@ -1637,20 +1606,18 @@ def _genhurst(S, q): my = np.mean(np.log10(mcord)) SSxy = np.sum( np.multiply( - np.log10(x), np.transpose(np.log10(mcord)) - ) + np.log10(x), np.transpose(np.log10(mcord)), + ), ) - Tmax*mx*my H[k] = SSxy/SSxx k = k + 1 - mH = np.mean(H)/q + return np.mean(H)/q - return mH def _aste_line_fit(x, y): - """ - Simple linear regression with ordinary least squares - https://en.wikipedia.org/wiki/Simple_linear_regression + """Simple linear regression with ordinary least squares + https://en.wikipedia.org/wiki/Simple_linear_regression. NOTE: this function is left here to demonstrate the correctness of T. Aste's MATLAB code for hurst_multifractal_dm. 
You can get the same @@ -1694,8 +1661,7 @@ def _aste_line_fit(x, y): def mfhurst_dm(data, qvals=None, max_dists=range(5, 20), detrend=True, fit="poly", debug_plot=False, debug_data=False, plot_file=None): - """ - Calculates the Generalized Hurst Exponent H_q for different q according to + """Calculates the Generalized Hurst Exponent H_q for different q according to the MATLAB code of Tomaso Aste - one of the authors that introduced this measure. @@ -1791,7 +1757,7 @@ def mfhurst_dm(data, qvals=None, max_dists=range(5, 20), detrend=True, qvals = [1] if len(data) < 60: warnings.warn( - "H(q) is not reliable for small time series ({} < 60)".format(len(data)) + f"H(q) is not reliable for small time series ({len(data)} < 60)", stacklevel=2, ) max_max_dist = np.max(max_dists) hhcorr = [] @@ -1834,21 +1800,19 @@ def mfhurst_dm(data, qvals=None, max_dists=range(5, 20), detrend=True, polys, x_label="log(x)", y_label="$\\log(c_q(x)) / q$", data_labels=["q = %d" % q for q in qvals], - reg_labels=["reg. line (H = {:.3f})".format(h) for h in H[:, -1] / qvals], - fname=plot_file + reg_labels=[f"reg. line (H = {h:.3f})" for h in H[:, -1] / qvals], + fname=plot_file, ) mH = np.mean(H, axis=1) / qvals sH = np.std(H, axis=1) / qvals if debug_data: return [mH, sH, (xvals, yvals, polys)] - else: - return [mH, sH] + return [mH, sH] def corr_dim(data, emb_dim, lag=1, rvals=None, dist=rowwise_euclidean, fit="RANSAC", debug_plot=False, debug_data=False, plot_file=None): - """ - Calculates the correlation dimension with the Grassberger-Procaccia algorithm + """Calculates the correlation dimension with the Grassberger-Procaccia algorithm. 
Explanation of correlation dimension: The correlation dimension is a characteristic measure that can be used @@ -1979,25 +1943,20 @@ def corr_dim(data, emb_dim, lag=1, rvals=None, dist=rowwise_euclidean, fname=plot_file) if debug_data: return (poly[0], (np.log(rvals), np.log(csums), poly)) - else: - return poly[0] + return poly[0] def detrend_data(data, order=1, fit="poly"): - """ - Removes a trend of given order from the data. - """ + """Removes a trend of given order from the data.""" # TODO also use this function in dfa xvals = np.arange(len(data)) trend = poly_fit(xvals, data, order, fit=fit) - detrended = data - np.polyval(trend, xvals) - return detrended + return data - np.polyval(trend, xvals) def dfa(data, nvals=None, overlap=True, order=1, fit_trend="poly", fit_exp="RANSAC", debug_plot=False, debug_data=False, plot_file=None): - """ - Performs a detrended fluctuation analysis (DFA) on the given data + """Performs a detrended fluctuation analysis (DFA) on the given data. Recommendations for parameter settings by Hardstone et al.: * nvals should be equally spaced on a logarithmic scale so that each window @@ -2036,7 +1995,7 @@ def dfa(data, nvals=None, overlap=True, order=1, fit_trend="poly", in this sense are smooth signals with monotonous or slowly oscillating behavior that are caused by external effects and not the dynamical system under study. - + To get a hold of these trends, the first step is to calculate the "profile" of our time series as the cumulative sum of deviations from the mean, effectively integrating our data. This both smoothes out measurement noise @@ -2052,7 +2011,7 @@ def dfa(data, nvals=None, overlap=True, order=1, fit_trend="poly", size has the form W_(n,i) = [y_i, y_(i+1), y_(i+2), ... y_(i+n-1)] - + The local trends are then removed for each window separately by fitting a polynomial p_(n,i) to the window W_(n,i) and then calculating W'_(n,i) = W_(n,i) - p_(n,i) (element-wise subtraction). 
@@ -2166,14 +2125,17 @@ def dfa(data, nvals=None, overlap=True, order=1, fit_trend="poly", else: nvals = [total_N-2, total_N-1] msg = "choosing nvals = {} , DFA with less than ten data points is " \ - + "extremely unreliable" - warnings.warn(msg.format(nvals), RuntimeWarning) + "extremely unreliable" + warnings.warn(msg.format(nvals), RuntimeWarning, stacklevel=2) if len(nvals) < 2: - raise ValueError("at least two nvals are needed") + msg = "at least two nvals are needed" + raise ValueError(msg) if np.min(nvals) < 2: - raise ValueError("nvals must be at least two") + msg = "nvals must be at least two" + raise ValueError(msg) if np.max(nvals) >= total_N: - raise ValueError("nvals cannot be larger than the input size") + msg = "nvals cannot be larger than the input size" + raise ValueError(msg) # create the signal profile # (cumulative sum of deviations from the mean => "walk") walk = np.cumsum(data - np.mean(data)) @@ -2218,5 +2180,4 @@ def dfa(data, nvals=None, overlap=True, order=1, fit_trend="poly", fname=plot_file) if debug_data: return (poly[0], (np.log(nvals), np.log(fluctuations), poly)) - else: - return poly[0] + return poly[0] diff --git a/nolds/test_measures.py b/nolds/test_measures.py index b56c3e6..a2123d4 100644 --- a/nolds/test_measures.py +++ b/nolds/test_measures.py @@ -1,17 +1,12 @@ -# -*- coding: utf-8 -*- -from __future__ import (absolute_import, division, - print_function, unicode_literals) -from builtins import ( - bytes, dict, int, list, object, range, str, ascii, chr, hex, input, next, - oct, open, pow, round, super, filter, map, zip -) +import unittest +import warnings + import numpy as np +from nolds import datasets + # import internal module to test helping functions from nolds import measures as nolds -from nolds import datasets -import unittest -import warnings # TODO add tests for mfhurst_b and mfhurst_dm @@ -27,52 +22,47 @@ class TestNoldsHelperFunctions(unittest.TestCase): - """ - Tests for internal helper functions that are not part 
of the public API - """ - def assert_array_equals(self, expected, actual, print_arrays=False): + """Tests for internal helper functions that are not part of the public API.""" + def assert_array_equals(self, expected, actual, print_arrays=False) -> None: if print_arrays: - print(actual) - print("==") - print(expected) - print() - self.assertTrue(np.all(actual == expected)) + pass + assert np.all(actual == expected) - def test_delay_embed_lag2(self): + def test_delay_embed_lag2(self) -> None: data = np.arange(10, dtype="float32") embedded = nolds.delay_embedding(data, 4, lag=2) expected = np.array([ [0, 2, 4, 6], [1, 3, 5, 7], [2, 4, 6, 8], - [3, 5, 7, 9] + [3, 5, 7, 9], ], dtype="float32") self.assert_array_equals(expected, embedded) - def test_delay_embed(self): + def test_delay_embed(self) -> None: data = np.arange(6, dtype="float32") embedded = nolds.delay_embedding(data, 4) expected = np.array([ [0, 1, 2, 3], [1, 2, 3, 4], - [2, 3, 4, 5] + [2, 3, 4, 5], ], dtype="float32") self.assert_array_equals(expected, embedded) - def test_delay_embed_lag3(self): + def test_delay_embed_lag3(self) -> None: data = np.arange(10, dtype="float32") embedded = nolds.delay_embedding(data, 4, lag=3) expected = np.array([ - [0, 3, 6, 9] + [0, 3, 6, 9], ], dtype="float32") self.assert_array_equals(expected, embedded) - def test_delay_embed_empty(self): + def test_delay_embed_empty(self) -> None: data = np.arange(10, dtype="float32") try: embedded = nolds.delay_embedding(data, 11) msg = "embedding array of size 10 with embedding dimension 11 " \ - + "should fail, got {} instead" + "should fail, got {} instead" self.fail(msg.format(embedded)) except ValueError: pass @@ -80,38 +70,34 @@ def test_delay_embed_empty(self): try: embedded = nolds.delay_embedding(data, 4, lag=4) msg = "embedding array of size 10 with embedding dimension 4 and " \ - + "lag 4 should fail, got {} instead" + "lag 4 should fail, got {} instead" self.fail(msg.format(embedded)) except ValueError: pass class 
TestNoldsUtility(unittest.TestCase): - """ - Tests for small utility functions that are part of the public API - """ - def test_binary_n(self): + """Tests for small utility functions that are part of the public API.""" + def test_binary_n(self) -> None: x = nolds.binary_n(1000, min_n=50) self.assertSequenceEqual(x, [500, 250, 125, 62]) - def test_binary_n_empty(self): + def test_binary_n_empty(self) -> None: x = nolds.binary_n(50, min_n=50) self.assertSequenceEqual(x, []) - def test_logarithmic_n(self): + def test_logarithmic_n(self) -> None: x = nolds.logarithmic_n(4, 11, 1.51) self.assertSequenceEqual(x, [4, 6, 9]) - def test_logarithmic_r(self): + def test_logarithmic_r(self) -> None: x = nolds.logarithmic_r(4, 10, 1.51) self.assertSequenceEqual(x, [4, 6.04, 9.1204]) class TestNoldsLyap(unittest.TestCase): - """ - Tests for lyap_e and lyap_r - """ - def test_lyap_logistic(self): + """Tests for lyap_e and lyap_r.""" + def test_lyap_logistic(self) -> None: rvals = [2.5, 3.4, 3.7, 4.0] sign = [-1, -1, 1, 1] x0 = 0.1 @@ -128,12 +114,12 @@ def logistic(x, r): log = np.array(log, dtype="float32") le = np.max(nolds.lyap_e(log, emb_dim=6, matrix_dim=2)) lr = nolds.lyap_r(log, emb_dim=6, lag=2, min_tsep=10, trajectory_len=20) - self.assertEqual(s, int(np.sign(le)), "r = {}".format(r)) - self.assertEqual(s, int(np.sign(lr)), "r = {}".format(r)) + assert s == int(np.sign(le)), f"r = {r}" + assert s == int(np.sign(lr)), f"r = {r}" - def test_lyap_lorenz(self): + def test_lyap_lorenz(self) -> None: """Test hypothesis: Both lyap_r and lyap_e can reconstruct the largest Lyapunov exponent of the Lorenz system. - + The parameters for generating the Lorenz system were chosen to be as close as possible to the experiments performed by Leonov and Kuznetsov (see [l_4]_) and . @@ -147,37 +133,35 @@ def test_lyap_lorenz(self): doi: 10.1016/j.amc.2014.12.132. 
""" data = datasets.lorenz_euler(3000, 10, 28, 8/3.0, start=[1,1,1], dt=0.01)[1000:] - lyap_r_args = dict(min_tsep=10, emb_dim=5, tau=0.01, lag=5, trajectory_len=28, fit_offset=8, fit="poly") + lyap_r_args = {"min_tsep": 10, "emb_dim": 5, "tau": 0.01, "lag": 5, "trajectory_len": 28, "fit_offset": 8, "fit": "poly"} lyap_rx = nolds.lyap_r(data[:, 0], **lyap_r_args) lyap_ry = nolds.lyap_r(data[:, 1], **lyap_r_args) lyap_rz = nolds.lyap_r(data[:, 2], **lyap_r_args) - lyap_e_args = dict(min_tsep=10, emb_dim=5, matrix_dim=5, tau=0.01, min_nb=8) + lyap_e_args = {"min_tsep": 10, "emb_dim": 5, "matrix_dim": 5, "tau": 0.01, "min_nb": 8} lyap_ex = nolds.lyap_e(data[:, 0], **lyap_e_args) lyap_ey = nolds.lyap_e(data[:, 1], **lyap_e_args) lyap_ez = nolds.lyap_e(data[:, 2], **lyap_e_args) self.assertAlmostEqual(2.4, lyap_rx, delta=0.5) self.assertAlmostEqual(2.4, lyap_ry, delta=0.5) self.assertAlmostEqual(2.4, lyap_rz, delta=0.5) - self.assertGreater(lyap_ex[0], 1.5) - self.assertGreater(lyap_ey[0], 1.5) - self.assertGreater(lyap_ez[0], 1.5) + assert lyap_ex[0] > 1.5 + assert lyap_ey[0] > 1.5 + assert lyap_ez[0] > 1.5 - def test_lyap_fbm(self): + def test_lyap_fbm(self) -> None: data = datasets.fbm(1000, H=0.3) le = nolds.lyap_e(data, emb_dim=7, matrix_dim=3) - self.assertGreater(np.max(le), 0) + assert np.max(le) > 0 - def test_lyap_r_limits(self): - """ - tests if minimal input size is correctly calculated - """ + def test_lyap_r_limits(self) -> None: + """Tests if minimal input size is correctly calculated.""" np.random.seed(0) for i in range(10): kwargs = { "emb_dim": np.random.randint(1,10), "lag": np.random.randint(1,6), "min_tsep": np.random.randint(0,5), - "trajectory_len": np.random.randint(2,10) + "trajectory_len": np.random.randint(2,10), } min_len = nolds.lyap_r_len(**kwargs) for i in reversed(range(max(1,min_len-5),min_len+5)): @@ -189,39 +173,34 @@ def test_lyap_r_limits(self): warnings.simplefilter("ignore", RuntimeWarning) nolds.lyap_r(data, fit="poly", **kwargs) 
msg = "{} data points should be required for kwargs {}, but " \ - + "{} where enough" + "{} where enough" self.fail(msg.format( min_len, kwargs, - i + i, )) - except ValueError as e: + except ValueError: #print(e) pass else: ## enough data points => execution should succeed msg = "{} data points should be enough for kwargs {}, but " \ - + " {} where too few" + " {} where too few" try: - self.assertTrue( - np.all(np.isfinite(nolds.lyap_r(data, fit="poly", **kwargs))), - msg.format(min_len, kwargs, i) - ) + assert np.all(np.isfinite(nolds.lyap_r(data, fit="poly", **kwargs))), msg.format(min_len, kwargs, i) except ValueError as e: self.fail( - msg.format(min_len, kwargs, i) + ", original error: "+str(e) + msg.format(min_len, kwargs, i) + ", original error: "+str(e), ) - def test_lyap_e_limits(self): - """ - tests if minimal input size is correctly calculated - """ + def test_lyap_e_limits(self) -> None: + """Tests if minimal input size is correctly calculated.""" np.random.seed(1) for i in range(10): kwargs = { "matrix_dim": np.random.randint(2,10), "min_tsep": np.random.randint(0,10), - "min_nb": np.random.randint(2,15) + "min_nb": np.random.randint(2,15), } kwargs["emb_dim"] = np.random.randint(1,4) \ * (kwargs["matrix_dim"] - 1) + 1 @@ -235,35 +214,30 @@ def test_lyap_e_limits(self): warnings.simplefilter("ignore", RuntimeWarning) nolds.lyap_e(data, **kwargs) msg = "{} data points should be required for kwargs {}, but " \ - + "{} where enough" + "{} where enough" self.fail(msg.format( min_len, kwargs, - i + i, )) - except ValueError as e: + except ValueError: #print(e) pass else: ## enough data points => execution should succeed msg = "{} data points should be enough for kwargs {}, but " \ - + " {} where too few" + " {} where too few" try: - self.assertTrue( - np.all(np.isfinite(nolds.lyap_e(data, **kwargs))), - msg.format(min_len, kwargs, i) - ) + assert np.all(np.isfinite(nolds.lyap_e(data, **kwargs))), msg.format(min_len, kwargs, i) except ValueError as e: 
self.fail( - msg.format(min_len, kwargs, i) + ", original error: "+str(e) + msg.format(min_len, kwargs, i) + ", original error: "+str(e), ) class TestNoldsHurst(unittest.TestCase): - """ - Tests for hurst_rs - """ - def test_hurst_basic(self): + """Tests for hurst_rs.""" + def test_hurst_basic(self) -> None: np.random.seed(2) # strong negative correlation between successive elements seq_neg = [] @@ -274,7 +248,7 @@ def test_hurst_basic(self): h_neg = nolds.hurst_rs(seq_neg) #print("h_neg = %.3f" % h_neg) # expected h is around 0 - self.assertLess(h_neg, 0.3) + assert h_neg < 0.3 # no correlation, just random noise x = np.random.randn(10000) @@ -289,12 +263,10 @@ def test_hurst_basic(self): h_walk = nolds.hurst_rs(walk) #print("h_walk = %.3f" % h_walk) # expected h is around 1.0 - self.assertGreater(h_walk, 0.9) + assert h_walk > 0.9 - def test_hurst_pracma(self): - """ - Tests for hurst_rs using the same tests as in the R-package pracma - """ + def test_hurst_pracma(self) -> None: + """Tests for hurst_rs using the same tests as in the R-package pracma.""" np.random.seed(3) # This test reproduces the results presented by Ian L. Kaplan on # bearcave.com @@ -313,8 +285,8 @@ def test_hurst_pracma(self): hlm = nolds.hurst_rs(xlm, fit="poly", nvals=2**np.arange(3,11)) #print("hlm = %.3f" % hlm) self.assertAlmostEqual(hlm, 0.43, delta=0.05) - - def test_hurst_lorenz(self): + + def test_hurst_lorenz(self) -> None: """Test hypothesis: We get correct values for estimating the hurst exponent of the Lorenz system. All parameter values are chosen to replicate the experiment by Suyal et al. (see [l_3]_) @@ -328,7 +300,7 @@ def test_hurst_lorenz(self): 2009, doi: 10.1007/s11207-009-9467-x. 
""" data = datasets.lorenz_euler(3000, 10, 28, 8/3.0, start=[1,1,1], dt=0.01)[1000:] - hurst_rs_args = dict(fit="poly", nvals=nolds.logarithmic_n(10, 70, 1.1)) + hurst_rs_args = {"fit": "poly", "nvals": nolds.logarithmic_n(10, 70, 1.1)} hx = nolds.hurst_rs(data[:, 0], **hurst_rs_args) hy = nolds.hurst_rs(data[:, 1], **hurst_rs_args) hz = nolds.hurst_rs(data[:, 2], **hurst_rs_args) @@ -337,10 +309,8 @@ def test_hurst_lorenz(self): self.assertAlmostEqual(0.9, hz, delta=0.05) class TestNoldsDFA(unittest.TestCase): - """ - Tests for dfa - """ - def test_dfa_base(self): + """Tests for dfa.""" + def test_dfa_base(self) -> None: np.random.seed(4) # strong negative correlation between successive elements seq_neg = [] @@ -350,30 +320,30 @@ def test_dfa_base(self): seq_neg.append(x) h_neg = nolds.dfa(seq_neg) # expected h is around 0 - self.assertLess(h_neg, 0.3) + assert h_neg < 0.3 # no correlation, just random noise x = np.random.randn(10000) h_rand = nolds.dfa(x) # expected h is around 0.5 - self.assertLess(h_rand, 0.7) - self.assertGreater(h_rand, 0.3) + assert h_rand < 0.7 + assert h_rand > 0.3 # cumulative sum has strong positive correlation between # elements walk = np.cumsum(x) h_walk = nolds.dfa(walk) # expected h is around 1.0 - self.assertGreater(h_walk, 0.7) + assert h_walk > 0.7 - def test_dfa_fbm(self): + def test_dfa_fbm(self) -> None: hs = [0.3, 0.5, 0.7] for h in hs: data = datasets.fbm(1000, H=h) he = nolds.dfa(data) self.assertAlmostEqual(he, h + 1, delta=0.15) - def test_dfa_lorenz(self): + def test_dfa_lorenz(self) -> None: """Test hypothesis: We get correct values for estimating the Hurst parameter of the Lorenz system. All parameter values are chosen to replicate the experiment by Wallot et al. 
(see [l_5]_) @@ -390,7 +360,7 @@ def test_dfa_lorenz(self): """ data = datasets.lorenz_euler(120000, 10, 28, 8/3.0, start=[0.1,0.1,0.1], dt=0.002)[20000:] nvals = nolds.logarithmic_n(200, len(data)/8, 2**0.2) - dfa_args = dict(nvals=nvals, order=2, overlap=False, fit_exp="poly") + dfa_args = {"nvals": nvals, "order": 2, "overlap": False, "fit_exp": "poly"} dx = nolds.dfa(data[:, 0], **dfa_args) dy = nolds.dfa(data[:, 1], **dfa_args) dz = nolds.dfa(data[:, 2], **dfa_args) @@ -398,17 +368,17 @@ def test_dfa_lorenz(self): self.assertAlmostEqual(0.926, dy, delta=0.032) self.assertAlmostEqual(0.650, dz, delta=0.44) - def test_dfa_agreement_with_physionet(self): + def test_dfa_agreement_with_physionet(self) -> None: """Test hypothesis: Using the same parameters, the output of nolds is identical to the output of PhysioNet.""" lorenz_x, physionet_points = datasets.load_lorenz_physionet() nvals = [round(x) for x in 10 ** physionet_points[:,0]] _, (_, nolds_rs, _) = nolds.dfa(lorenz_x, nvals=nvals, overlap=False, fit_exp="poly", debug_data=True) nolds_rs_log10 = nolds_rs / np.log(10) # assert that sum of squared errors is less than 1e-9 - self.assertLess(sum((physionet_points[:,1] - nolds_rs_log10)**2), 1e-9) + assert sum((physionet_points[:, 1] - nolds_rs_log10) ** 2) < 1e-09 @unittest.skipUnless(SCIPY_AVAILABLE, "Tests using Lévy motion require scipy.") - def test_dfa_levy(self): + def test_dfa_levy(self) -> None: """Test hypothesis: We get correct values for estimating the Hurst parameter of Lévy motion. Reference: https://github.com/CSchoel/nolds/issues/17#issuecomment-1905472813. 
@@ -421,10 +391,8 @@ def test_dfa_levy(self): class TestNoldsCorrDim(unittest.TestCase): - """ - Tests for corr_dim - """ - def test_corr_dim(self): + """Tests for corr_dim.""" + def test_corr_dim(self) -> None: np.random.seed(5) n = 1000 data = np.arange(n) @@ -436,7 +404,7 @@ def test_corr_dim(self): self.assertAlmostEqual(cd, 0.5, delta=0.15) # TODO test example for cd > 1 - def test_lorenz(self): + def test_lorenz(self) -> None: """Test hypothesis: We get correct values for estimating the correlation dimension of the Lorenz system. All parameter values are chosen to replicate the experiment by Grassberger and Procaccia (1983) @@ -461,16 +429,14 @@ def test_lorenz(self): cd = nolds.corr_dim(x, emb_dim, fit="poly", rvals=rvals, lag=lag) self.assertAlmostEqual(cd, 2.05, delta=0.2) - def test_logistic(self): + def test_logistic(self) -> None: # TODO replicate tests with logistic map from grassberger-procaccia pass class TestNoldsSampEn(unittest.TestCase): - """ - Tests for sampen - """ - def test_sampen_base(self): + """Tests for sampen.""" + def test_sampen_base(self) -> None: data = [0, 1, 5, 4, 1, 0, 1, 5, 3] # matches for m=2: 01-01, 15-15 # matches for m=3: 015-015 @@ -489,7 +455,7 @@ def test_sampen_base(self): se = nolds.sampen(data, emb_dim=3, tolerance=0.5) self.assertAlmostEqual(se, -np.log(1.0/4), delta=0.01) - def test_sampen_logistic(self): + def test_sampen_logistic(self) -> None: # logistic map with r = 2.8 => static value data = list(datasets.logistic_map(0.45, 1000, r=2.8)) self.assertAlmostEqual(0, nolds.sampen(data), delta=0.001) @@ -507,7 +473,7 @@ def test_sampen_logistic(self): self.assertAlmostEqual(0.5, nolds.sampen(data[100:]), delta=0.1) self.assertAlmostEqual(0.5, nolds.sampen(data[100:], emb_dim=5), delta=0.1) - def test_sampen_random(self): + def test_sampen_random(self) -> None: np.random.seed(6) # normally distributed random numbers data = np.random.randn(10000) @@ -515,12 +481,12 @@ def test_sampen_random(self): 
self.assertAlmostEqual(2.2, nolds.sampen(data, emb_dim=2), delta=0.1) # TODO add tests with uniformly distributed random numbers - def test_sampen_sinus(self): + def test_sampen_sinus(self) -> None: # TODO add test with sinus signal pass - def test_sampen_lorenz(self): + def test_sampen_lorenz(self) -> None: """Test hypothesis: We get correct values for estimating the sample entropy of the Lorenz system. All parameter values are chosen to replicate the experiment by Kaffashi et al. (2008) @@ -535,7 +501,7 @@ def test_sampen_lorenz(self): pp. 3069–3074, 2008, doi: 10.1016/j.physd.2008.06.005. """ data = datasets.lorenz_euler(3000, 10, 28, 8/3.0, start=[1,1,1], dt=0.01)[1000:] - sampen_args = dict(emb_dim=2, lag=1) + sampen_args = {"emb_dim": 2, "lag": 1} sx = nolds.sampen(data[:, 0], **sampen_args) sy = nolds.sampen(data[:, 1], **sampen_args) sz = nolds.sampen(data[:, 2], **sampen_args) @@ -551,20 +517,20 @@ class RegressionTests(unittest.TestCase): as updates to core dependencies such as numpy or the Python standard library. 
""" - def test_sampen(self): + def test_sampen(self) -> None: """Test hypothesis: The exact output of sampen() on random data hasn't changed since the last version.""" data = datasets.load_qrandom()[:1000] se = nolds.sampen(data, emb_dim=2, tolerance=None, lag=1, dist=nolds.rowwise_chebyshev, closed=False) self.assertAlmostEqual(2.1876999522832743, se, places=14) - def test_corr_dim(self): + def test_corr_dim(self) -> None: """Test hypothesis: The exact output of corr_dim() with `fit=poly` on random data hasn't changed since the last version.""" data = datasets.load_qrandom()[:1000] cd = nolds.corr_dim(data, emb_dim=5, lag=1, rvals=None, dist=nolds.rowwise_euclidean, fit="poly") self.assertAlmostEqual(1.303252839255068, cd, places=14) @unittest.skipUnless(SCIPY_AVAILABLE, "Tests with RANSAC require scipy.") - def test_corr_dim_RANSAC(self): + def test_corr_dim_RANSAC(self) -> None: """Test hypothesis: The exact output of corr_dim() with `fit=RANSAC` on random data hasn't changed since the last version.""" data = datasets.load_qrandom()[:1000] sd = np.std(data, ddof=1) @@ -575,7 +541,7 @@ def test_corr_dim_RANSAC(self): cd = nolds.corr_dim(data, emb_dim=5, lag=1, rvals=rvals, dist=nolds.rowwise_euclidean, fit="RANSAC") self.assertAlmostEqual(0.44745494643404665, cd, places=14) - def test_lyap_e(self): + def test_lyap_e(self) -> None: """Test hypothesis: The exact output of lyap_e() on random data hasn't changed since the last version.""" data = datasets.load_qrandom()[:1000] le = nolds.lyap_e(data, emb_dim=10, matrix_dim=4, min_nb=10, min_tsep=1, tau=1) @@ -583,7 +549,7 @@ def test_lyap_e(self): for i in range(le.shape[0]): self.assertAlmostEqual(expected[i], le[i], places=14, msg=f"{i+1}th Lyapunov exponent doesn't match") - def test_lyap_r(self): + def test_lyap_r(self) -> None: """Test hypothesis: The exact output of lyap_r() with `fit=poly` on random data hasn't changed since the last version.""" data = datasets.load_qrandom()[:1000] le = nolds.lyap_r(data, 
emb_dim=10, lag=1, min_tsep=1, tau=1, min_neighbors=10, trajectory_len=10, fit="poly") @@ -591,7 +557,7 @@ def test_lyap_r(self): self.assertAlmostEqual(expected, le, places=14) @unittest.skipUnless(SCIPY_AVAILABLE, "Tests with RANSAC require scipy.") - def test_lyap_r_RANSAC(self): + def test_lyap_r_RANSAC(self) -> None: """Test hypothesis: The exact output of lyap_r() with `fit=RANSAC` on random data hasn't changed since the last version.""" data = datasets.load_qrandom()[:1000] np.random.seed(42) @@ -601,7 +567,7 @@ def test_lyap_r_RANSAC(self): expected = 0.0003401212353253564 self.assertAlmostEqual(expected, le, places=14) - def test_hurst_rs(self): + def test_hurst_rs(self) -> None: """Test hypothesis: The exact output of hurst_rs() with `fit=poly` on random data hasn't changed since the last version.""" data = datasets.load_qrandom()[:1000] rs = nolds.hurst_rs(data, nvals=None, fit="poly", corrected=True, unbiased=True) @@ -609,7 +575,7 @@ def test_hurst_rs(self): self.assertAlmostEqual(expected, rs, places=14) @unittest.skipUnless(SCIPY_AVAILABLE, "Tests with RANSAC require scipy.") - def test_hurst_rs_RANSAC(self): + def test_hurst_rs_RANSAC(self) -> None: """Test hypothesis: The exact output of hurst_rs() with `fit=RANSAC` on random data hasn't changed since the last version.""" data = datasets.load_qrandom()[:1000] np.random.seed(42) @@ -619,7 +585,7 @@ def test_hurst_rs_RANSAC(self): expected = 0.4805431939943321 self.assertAlmostEqual(expected, rs, places=14) - def test_dfa(self): + def test_dfa(self) -> None: """Test hypothesis: The exact output of dfa() with `fit_exp=poly` on random data hasn't changed since the last version.""" data = datasets.load_qrandom()[:1000] h = nolds.dfa(data, nvals=None, overlap=True, order=1, fit_trend="poly", fit_exp="poly") @@ -627,7 +593,7 @@ def test_dfa(self): self.assertAlmostEqual(expected, h, places=14) @unittest.skipUnless(SCIPY_AVAILABLE, "Tests with RANSAC require scipy.") - def test_dfa_RANSAC(self): + def 
test_dfa_RANSAC(self) -> None: """Test hypothesis: The exact output of dfa() with `fit_exp=RANSAC` on random data hasn't changed since the last version.""" # adds trend to data to introduce a less clear line for fitting data = datasets.load_qrandom()[:1000] + np.arange(1000) * 100 @@ -638,7 +604,7 @@ def test_dfa_RANSAC(self): expected = 1.1372303125405405 self.assertAlmostEqual(expected, h, places=14) - def test_mfhurst_b(self): + def test_mfhurst_b(self) -> None: """Test hypothesis: The exact output of mfhurst_b() with `fit=poly` on random data hasn't changed since the last version.""" data = datasets.load_qrandom()[:1000] h = nolds.mfhurst_b(data, qvals=[1], dists=None, fit="poly") @@ -646,7 +612,7 @@ def test_mfhurst_b(self): self.assertAlmostEqual(expected[0], h[0], places=14) @unittest.skipUnless(SCIPY_AVAILABLE, "Tests with RANSAC require scipy.") - def test_mfhurst_b_RANSAC(self): + def test_mfhurst_b_RANSAC(self) -> None: """Test hypothesis: The exact output of mfhurst_b() with `fit=RANSAC` on random data hasn't changed since the last version.""" data = datasets.load_qrandom()[:1000] np.random.seed(42) @@ -654,7 +620,7 @@ def test_mfhurst_b_RANSAC(self): expected = [-0.009056463064211057] self.assertAlmostEqual(expected[0], h[0], places=14) - def test_mfhurst_dm(self): + def test_mfhurst_dm(self) -> None: """Test hypothesis: The exact output of mfhurst_dm() with `fit=poly` on random data hasn't changed since the last version.""" data = datasets.load_qrandom()[:1000] h, _ = nolds.mfhurst_dm(data, qvals=[1], max_dists=range(5, 20), detrend=True, fit="poly") @@ -662,7 +628,7 @@ def test_mfhurst_dm(self): self.assertAlmostEqual(expected[0], h[0], places=14) @unittest.skipUnless(SCIPY_AVAILABLE, "Tests with RANSAC require scipy.") - def test_mfhurst_dm_RANSAC(self): + def test_mfhurst_dm_RANSAC(self) -> None: """Test hypothesis: The exact output of mfhurst_dm() with `fit=RANSAC` on random data hasn't changed since the last version.""" data = 
datasets.load_qrandom()[:1000] np.random.seed(42) @@ -674,9 +640,9 @@ def test_mfhurst_dm_RANSAC(self): class PreviousDefectTests(unittest.TestCase): """Tests that ensure that a previous bug doesn't come back at some point.""" - def test_lyap_r_complex_min_tsep(self): + def test_lyap_r_complex_min_tsep(self) -> None: """Test hypothesis: The `min_tsep` parameter can be calculated without creating complex numbers. - + Previously, this would lead to an exception in the code. See https://github.com/CSchoel/nolds/issues/53 for reference. """ diff --git a/pyproject.toml b/pyproject.toml index 314114a..66fd0b1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -65,7 +65,7 @@ build-backend = "hatchling.build" [tool.ruff] line-length = 100 # allow slightly longer lines -indent-width = 2 # use two spaces for now to not upset existing code +indent-width = 4 # Assume Python 3.8 target-version = "py38" From eacd58e5ac02d71b8a65371f0478cded949341fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christopher=20Sch=C3=B6lzel?= Date: Wed, 11 Jun 2025 21:48:10 +0200 Subject: [PATCH 02/36] applies ruff formatter --- doc/source/conf.py | 123 +- nolds/datasets.py | 600 +++--- nolds/examples.py | 1070 +++++----- nolds/measures.py | 4199 ++++++++++++++++++++-------------------- nolds/test_measures.py | 1278 ++++++------ pyproject.toml | 1 + 6 files changed, 3741 insertions(+), 3530 deletions(-) diff --git a/doc/source/conf.py b/doc/source/conf.py index 3fa3297..ace8ab5 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -18,13 +18,13 @@ # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. 
-#sys.path.insert(0, os.path.abspath('.')) +# sys.path.insert(0, os.path.abspath('.')) sys.path.insert(0, os.path.abspath("../..")) # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. -#needs_sphinx = '1.0' +# needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom @@ -46,7 +46,7 @@ source_suffix = ".rst" # The encoding of source files. -#source_encoding = 'utf-8-sig' +# source_encoding = 'utf-8-sig' # The master toctree document. master_doc = "index" @@ -74,9 +74,9 @@ # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: -#today = '' +# today = '' # Else, today_fmt is used as the format for a strftime call. -#today_fmt = '%B %d, %Y' +# today_fmt = '%B %d, %Y' # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. @@ -84,27 +84,27 @@ # The reST default role (used for this markup: `text`) to use for all # documents. -#default_role = None +# default_role = None # If true, '()' will be appended to :func: etc. cross-reference text. -#add_function_parentheses = True +# add_function_parentheses = True # If true, the current module name will be prepended to all description # unit titles (such as .. function::). -#add_module_names = True +# add_module_names = True # If true, sectionauthor and moduleauthor directives will be shown in the # output. They are ignored by default. -#show_authors = False +# show_authors = False # The name of the Pygments (syntax highlighting) style to use. pygments_style = "sphinx" # A list of ignored prefixes for module index sorting. -#modindex_common_prefix = [] +# modindex_common_prefix = [] # If true, keep warnings as "system message" paragraphs in the built documents. 
-#keep_warnings = False +# keep_warnings = False # If true, `todo` and `todoList` produce output, else they produce nothing. todo_include_todos = False @@ -124,23 +124,23 @@ } # Add any paths that contain custom themes here, relative to this directory. -#html_theme_path = [] +# html_theme_path = [] # The name for this set of Sphinx documents. If None, it defaults to # " v documentation". -#html_title = None +# html_title = None # A shorter title for the navigation bar. Default is the same as html_title. -#html_short_title = None +# html_short_title = None # The name of an image file (relative to this directory) to place at the top # of the sidebar. -#html_logo = None +# html_logo = None # The name of an image file (within the static path) to use as favicon of the # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 # pixels large. -#html_favicon = None +# html_favicon = None # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, @@ -150,62 +150,62 @@ # Add any extra paths that contain custom files (such as robots.txt or # .htaccess) here, relative to this directory. These files are copied # directly to the root of the documentation. -#html_extra_path = [] +# html_extra_path = [] # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. -#html_last_updated_fmt = '%b %d, %Y' +# html_last_updated_fmt = '%b %d, %Y' # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. -#html_use_smartypants = True +# html_use_smartypants = True # Custom sidebar templates, maps document names to template names. -#html_sidebars = {} +# html_sidebars = {} # Additional templates that should be rendered to pages, maps page names to # template names. -#html_additional_pages = {} +# html_additional_pages = {} # If false, no module index is generated. 
-#html_domain_indices = True +# html_domain_indices = True # If false, no index is generated. -#html_use_index = True +# html_use_index = True # If true, the index is split into individual pages for each letter. -#html_split_index = False +# html_split_index = False # If true, links to the reST sources are added to the pages. -#html_show_sourcelink = True +# html_show_sourcelink = True # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. -#html_show_sphinx = True +# html_show_sphinx = True # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. -#html_show_copyright = True +# html_show_copyright = True # If true, an OpenSearch description file will be output, and all pages will # contain a tag referring to it. The value of this option must be the # base URL from which the finished HTML is served. -#html_use_opensearch = '' +# html_use_opensearch = '' # This is the file name suffix for HTML files (e.g. ".xhtml"). -#html_file_suffix = None +# html_file_suffix = None # Language to be used for generating the HTML full-text search index. # Sphinx supports the following languages: # 'da', 'de', 'en', 'es', 'fi', 'fr', 'h', 'it', 'ja' # 'nl', 'no', 'pt', 'ro', 'r', 'sv', 'tr' -#html_search_language = 'en' +# html_search_language = 'en' # A dictionary with options for the search language support, empty by default. # Now only 'ja' uses this config value -#html_search_options = {'type': 'default'} +# html_search_options = {'type': 'default'} # The name of a javascript file (relative to the configuration directory) that # implements a search results scorer. If empty, the default will be used. -#html_search_scorer = 'scorer.js' +# html_search_scorer = 'scorer.js' # Output file base name for HTML help builder. htmlhelp_basename = "Noldsdoc" @@ -213,46 +213,42 @@ # -- Options for LaTeX output --------------------------------------------- latex_elements = { -# The paper size ('letterpaper' or 'a4paper'). 
-#'papersize': 'letterpaper', - -# The font size ('10pt', '11pt' or '12pt'). -#'pointsize': '10pt', - -# Additional stuff for the LaTeX preamble. -#'preamble': '', - -# Latex figure (float) alignment -#'figure_align': 'htbp', + # The paper size ('letterpaper' or 'a4paper'). + #'papersize': 'letterpaper', + # The font size ('10pt', '11pt' or '12pt'). + #'pointsize': '10pt', + # Additional stuff for the LaTeX preamble. + #'preamble': '', + # Latex figure (float) alignment + #'figure_align': 'htbp', } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - (master_doc, "Nolds.tex", "Nolds Documentation", - "Christopher Schölzel", "manual"), + (master_doc, "Nolds.tex", "Nolds Documentation", "Christopher Schölzel", "manual"), ] # The name of an image file (relative to this directory) to place at the top of # the title page. -#latex_logo = None +# latex_logo = None # For "manual" documents, if this is true, then toplevel headings are parts, # not chapters. -#latex_use_parts = False +# latex_use_parts = False # If true, show page references after internal links. -#latex_show_pagerefs = False +# latex_show_pagerefs = False # If true, show URL addresses after external links. -#latex_show_urls = False +# latex_show_urls = False # Documents to append as an appendix to all manuals. -#latex_appendices = [] +# latex_appendices = [] # If false, no module index is generated. -#latex_domain_indices = True +# latex_domain_indices = True # -- Options for manual page output --------------------------------------- @@ -260,12 +256,11 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ - (master_doc, "nolds", "Nolds Documentation", - [author], 1), + (master_doc, "nolds", "Nolds Documentation", [author], 1), ] # If true, show URL addresses after external links. 
-#man_show_urls = False +# man_show_urls = False # -- Options for Texinfo output ------------------------------------------- @@ -274,21 +269,27 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - (master_doc, "Nolds", "Nolds Documentation", - author, "Nolds", "One line description of project.", - "Miscellaneous"), + ( + master_doc, + "Nolds", + "Nolds Documentation", + author, + "Nolds", + "One line description of project.", + "Miscellaneous", + ), ] # Documents to append as an appendix to all manuals. -#texinfo_appendices = [] +# texinfo_appendices = [] # If false, no module index is generated. -#texinfo_domain_indices = True +# texinfo_domain_indices = True # How to display URL addresses: 'footnote', 'no', or 'inline'. -#texinfo_show_urls = 'footnote' +# texinfo_show_urls = 'footnote' # If true, do not generate a @detailmenu in the "Top" node's menu. -#texinfo_no_detailmenu = False +# texinfo_no_detailmenu = False # autodoc_mock_imports = ['numpy', 'future', 'setuptools', 'builtins'] diff --git a/nolds/datasets.py b/nolds/datasets.py index 7195d60..7e5a6af 100644 --- a/nolds/datasets.py +++ b/nolds/datasets.py @@ -5,238 +5,249 @@ def lorenz_euler(length, sigma, rho, beta, dt=0.01, start=None): - """Simulates the Lorenz system using a simple Euler method. + """Simulates the Lorenz system using a simple Euler method. + + The Lorenz system is a three dimensional dynamical system given + by the following equations: + + dx/dt = sigma * (y - x) + dy/dt = rho * x - y - x * z + dz/dt = x * y - beta * z + """ + if start is None: + start = [1, 1, 1] + + def lorenz(state, sigma, rho, beta): + x, y, z = state + # NOTE: Numpy 1.x stores intermediate results as float64 + # => to achieve consistency between numpy versions, we have to use + # float32 for all values that enter the formula to simulate numpy 1.x + # behavior with numpy 2.x. 
+ return np.array( + [ + np.float32(sigma) * (y - x), + np.float32(rho) * x - y - x * z, + x * y - np.float32(beta) * z, + ], + dtype="float32", + ) + + trajectory = np.zeros((length, 3), dtype="float32") + trajectory[0] = start + for i in range(1, length): + # t = i * dt + trajectory[i] = trajectory[i - 1] + lorenz(trajectory[i - 1], sigma, rho, beta) * dt + return trajectory - The Lorenz system is a three dimensional dynamical system given - by the following equations: - - dx/dt = sigma * (y - x) - dy/dt = rho * x - y - x * z - dz/dt = x * y - beta * z - """ - if start is None: - start = [1, 1, 1] - def lorenz(state, sigma, rho, beta): - x, y, z = state - # NOTE: Numpy 1.x stores intermediate results as float64 - # => to achieve consistency between numpy versions, we have to use - # float32 for all values that enter the formula to simulate numpy 1.x - # behavior with numpy 2.x. - return np.array([ - np.float32(sigma) * (y - x), - np.float32(rho) * x - y - x * z, - x * y - np.float32(beta) * z, - ], dtype="float32") - trajectory = np.zeros((length, 3), dtype="float32") - trajectory[0] = start - for i in range(1, length): - # t = i * dt - trajectory[i] = trajectory[i-1] + lorenz(trajectory[i-1], sigma, rho, beta) * dt - return trajectory def lorenz_lyap(sigma, rho, beta): - """Calculates the exact Lyapunov dimension of the Lorenz system according to - Leonov 2015 [ll_1]_. + """Calculates the exact Lyapunov dimension of the Lorenz system according to + Leonov 2015 [ll_1]_. - References: - .. [ll_1] G. A. Leonov and N. V. Kuznetsov, “On differences and similarities in the - analysis of Lorenz, Chen, and Lu systems,” Applied Mathematics and Computation, - vol. 256, pp. 334–343, Apr. 2015, doi: 10.1016/j.amc.2014.12.132. - """ - return 3 - 2 * (sigma + beta + 1) / (sigma + 1 + np.sqrt((sigma-1) ** 2 + 4 * sigma * rho)) + References: + .. [ll_1] G. A. Leonov and N. V. 
Kuznetsov, “On differences and similarities in the + analysis of Lorenz, Chen, and Lu systems,” Applied Mathematics and Computation, + vol. 256, pp. 334–343, Apr. 2015, doi: 10.1016/j.amc.2014.12.132. + """ + return 3 - 2 * (sigma + beta + 1) / (sigma + 1 + np.sqrt((sigma - 1) ** 2 + 4 * sigma * rho)) def fbm(n, H=0.75): - """Generates fractional brownian motions of desired length. - - Author: - Christian Thomae - - References: - .. [fbm_1] https://en.wikipedia.org/wiki/Fractional_Brownian_motion#Method_1_of_simulation - - Args: - n (int): - length of sequence to generate - Kwargs: - H (float): - hurst parameter - - Returns: - array of float: - simulated fractional brownian motion - """ - # TODO more detailed description of fbm - assert H > 0 - assert H < 1 - - def R(t, s): - twoH = 2 * H - return 0.5 * (s**twoH + t**twoH - np.abs(t - s)**twoH) - # form the matrix tau - gamma = R(*np.mgrid[0:n, 0:n]) # apply R to every element in matrix - w, P = np.linalg.eigh(gamma) - L = np.diag(w) - sigma = np.dot(np.dot(P, np.sqrt(L)), np.linalg.inv(P)) - v = np.random.randn(n) - return np.dot(sigma, v) + """Generates fractional brownian motions of desired length. + + Author: + Christian Thomae + + References: + .. [fbm_1] https://en.wikipedia.org/wiki/Fractional_Brownian_motion#Method_1_of_simulation + + Args: + n (int): + length of sequence to generate + Kwargs: + H (float): + hurst parameter + + Returns: + array of float: + simulated fractional brownian motion + """ + # TODO more detailed description of fbm + assert H > 0 + assert H < 1 + + def R(t, s): + twoH = 2 * H + return 0.5 * (s**twoH + t**twoH - np.abs(t - s) ** twoH) + + # form the matrix tau + gamma = R(*np.mgrid[0:n, 0:n]) # apply R to every element in matrix + w, P = np.linalg.eigh(gamma) + L = np.diag(w) + sigma = np.dot(np.dot(P, np.sqrt(L)), np.linalg.inv(P)) + v = np.random.randn(n) + return np.dot(sigma, v) def fgn(n, H=0.75): - """Generates fractional gaussian noise of desired length. 
+ """Generates fractional gaussian noise of desired length. - References: - .. [fgn_1] https://en.wikipedia.org/wiki/Fractional_Brownian_motion + References: + .. [fgn_1] https://en.wikipedia.org/wiki/Fractional_Brownian_motion - Args: - n (int): - length of sequence to generate + Args: + n (int): + length of sequence to generate - Kwargs: - H (float): - hurst parameter + Kwargs: + H (float): + hurst parameter - Returns: - array of float: - simulated fractional gaussian noise - """ - return np.diff(fbm(n+1, H=H)) + Returns: + array of float: + simulated fractional gaussian noise + """ + return np.diff(fbm(n + 1, H=H)) def qrandom(n): - """Creates an array of n true random numbers obtained from the quantum random - number generator at qrng.anu.edu.au. + """Creates an array of n true random numbers obtained from the quantum random + number generator at qrng.anu.edu.au. - This function requires the package quantumrandom and an internet connection. + This function requires the package quantumrandom and an internet connection. - Args: - n (int): - length of the random array + Args: + n (int): + length of the random array - Return: - array of ints: - array of truly random unsigned 16 bit int values - """ - import quantumrandom - return np.concatenate([ - quantumrandom.get_data(data_type="uint16", array_length=1024) - for i in range(int(np.ceil(n/1024.0))) - ])[:n] + Return: + array of ints: + array of truly random unsigned 16 bit int values + """ + import quantumrandom + + return np.concatenate( + [ + quantumrandom.get_data(data_type="uint16", array_length=1024) + for i in range(int(np.ceil(n / 1024.0))) + ] + )[:n] def load_qrandom(): - """Loads a set of 10000 random numbers generated by qrandom. + """Loads a set of 10000 random numbers generated by qrandom. - This dataset can be used when you want to do some limited tests with "true" - random data without an internet connection. 
+ This dataset can be used when you want to do some limited tests with "true" + random data without an internet connection. - Returns: - int array - the dataset - """ - fname = "datasets/qrandom.npy" - with pkg_resources.resource_stream(__name__, fname) as f: - return np.load(f) + Returns: + int array + the dataset + """ + fname = "datasets/qrandom.npy" + with pkg_resources.resource_stream(__name__, fname) as f: + return np.load(f) def load_brown72(): - """Loads the dataset brown72 with a prescribed Hurst exponent of 0.72. + """Loads the dataset brown72 with a prescribed Hurst exponent of 0.72. - Source: http://bearcave.com/misl/misl_tech/wavelets/hurst/index.html + Source: http://bearcave.com/misl/misl_tech/wavelets/hurst/index.html - Returns: - float array: - the dataset - """ - fname = "datasets/brown72.npy" - with pkg_resources.resource_stream(__name__, fname) as f: - return np.load(f) + Returns: + float array: + the dataset + """ + fname = "datasets/brown72.npy" + with pkg_resources.resource_stream(__name__, fname) as f: + return np.load(f) def load_lorenz_physionet(): - """Loads a dataset containing the X variable of the Lorenz system - as well as the output of PhysioNet's dfa implementation on that dataset. + """Loads a dataset containing the X variable of the Lorenz system + as well as the output of PhysioNet's dfa implementation on that dataset. 
- The input data was created with the following code: + The input data was created with the following code: - data = datasets.lorenz_euler( - 3000, 10, 28, 8/3.0, start=[0.1,0.1,0.1], dt=0.012 - )[1000:,0] + data = datasets.lorenz_euler( + 3000, 10, 28, 8/3.0, start=[0.1,0.1,0.1], dt=0.012 + )[1000:,0] - The ouptut from PhysioNet was created by calling: + The ouptut from PhysioNet was created by calling: - dfa < lorenz.txt > lorenz_physionet.txt + dfa < lorenz.txt > lorenz_physionet.txt - Returns: - 1d float array: - time series of the X variable of the Lorenz system that was used as input - 2d float array: - x- and y-coordinates of the line fitting step in the PhysioNet output - """ - fname = "datasets/lorenz.txt" - with pkg_resources.resource_stream(__name__, fname) as f: - data_in = np.loadtxt(f) - fname = "datasets/lorenz_physionet.txt" - with pkg_resources.resource_stream(__name__, fname) as f: - data_out = np.loadtxt(f) - return data_in, data_out + Returns: + 1d float array: + time series of the X variable of the Lorenz system that was used as input + 2d float array: + x- and y-coordinates of the line fitting step in the PhysioNet output + """ + fname = "datasets/lorenz.txt" + with pkg_resources.resource_stream(__name__, fname) as f: + data_in = np.loadtxt(f) + fname = "datasets/lorenz_physionet.txt" + with pkg_resources.resource_stream(__name__, fname) as f: + data_out = np.loadtxt(f) + return data_in, data_out def tent_map(x, steps, mu=2): - """Generates a time series of the tent map. + """Generates a time series of the tent map. - Characteristics and Background: - The name of the tent map is derived from the fact that the plot of x_i vs - x_i+1 looks like a tent. For mu > 1 one application of the mapping function - can be viewed as stretching the surface on which the value is located and - then folding the area that is greater than one back towards the zero. 
This - corresponds nicely to the definition of chaos as expansion in one dimension - which is counteracted by a compression in another dimension. + Characteristics and Background: + The name of the tent map is derived from the fact that the plot of x_i vs + x_i+1 looks like a tent. For mu > 1 one application of the mapping function + can be viewed as stretching the surface on which the value is located and + then folding the area that is greater than one back towards the zero. This + corresponds nicely to the definition of chaos as expansion in one dimension + which is counteracted by a compression in another dimension. - Calculating the Lyapunov exponent: - The lyapunov exponent of the tent map can be easily calculated as due to - this stretching behavior a small difference delta between two neighboring - points will indeed grow exponentially by a factor of mu in each iteration. - We thus can assume that: + Calculating the Lyapunov exponent: + The lyapunov exponent of the tent map can be easily calculated as due to + this stretching behavior a small difference delta between two neighboring + points will indeed grow exponentially by a factor of mu in each iteration. + We thus can assume that: - delta_n = delta_0 * mu^n + delta_n = delta_0 * mu^n - We now only have to change the basis to e to obtain the exact formula that - is used for the definition of the lyapunov exponent: + We now only have to change the basis to e to obtain the exact formula that + is used for the definition of the lyapunov exponent: - delta_n = delta_0 * e^(ln(mu) * n) + delta_n = delta_0 * e^(ln(mu) * n) - Therefore the lyapunov exponent of the tent map is: + Therefore the lyapunov exponent of the tent map is: - lambda = ln(mu) + lambda = ln(mu) - References: - .. [tm_1] https://en.wikipedia.org/wiki/Tent_map + References: + .. 
[tm_1] https://en.wikipedia.org/wiki/Tent_map - Args: - x (float): - starting point - steps (int): - number of steps for which the generator should run + Args: + x (float): + starting point + steps (int): + number of steps for which the generator should run - Kwargs: - mu (int): - parameter mu that controls the behavior of the map + Kwargs: + mu (int): + parameter mu that controls the behavior of the map + + Returns: + generator object: + the generator that creates the time series + """ + for _ in range(steps): + x = mu * x if x < 0.5 else mu * (1 - x) + yield x - Returns: - generator object: - the generator that creates the time series - """ - for _ in range(steps): - x = mu * x if x < 0.5 else mu * (1 - x) - yield x # TODO should all math be formatted like this, or should the documentation of # logistic_map revert to a version that is more readable as plain text def logistic_map(x, steps, r=4): - r"""Generates a time series of the logistic map. + r"""Generates a time series of the logistic map. 
Characteristics and Background: The logistic map is among the simplest examples for a time series that can @@ -321,132 +332,133 @@ def logistic_map(x, steps, r=4): generator object: the generator that creates the time series """ - for _ in range(steps): - x = r * x * (1 - x) - yield x + for _ in range(steps): + x = r * x * (1 - x) + yield x def load_financial(): - """Loads the following datasets from CSV files in this package: - - - jkse: Jakarta Composite Index, downloaded on 2019-02-12 from https://finance.yahoo.com/quote/%5EJKSE/history?period1=631148400&period2=988668000&interval=1d&filter=history&frequency=1d - - n225: Nikkei 225, downloaded on 2019-02-12 from https://finance.yahoo.com/quote/%5EN225/history?period1=631148400&period2=988668000&interval=1d&filter=history&frequency=1d - - ndx: NASDAQ 100, downloaded on 2019-02-12 from https://finance.yahoo.com/quote/%5ENDX/history?period1=631148400&period2=988668000&interval=1d&filter=history&frequency=1d - - All datasets are daily prices from the period from 1990-01-01 to 2001-05-01 - missing values are NaN except for opening values which are treated as - follows: - - - If the first opening value is missing, the first *existing* opening value - is used for the first day. - - All other missing opening values are filled by the close value of the last - day where data was available. - - Returns: - list of tuple(1d-array, 2d-array): - datasets with days as array of date objects and 2d-array with the columns - "Open", "High", "Low", "Close", "Adj Close", and "Volume". Note that - "Open" values have been padded to ensure that there are no NaNs left. 
- """ - - def load_finance_yahoo_data(f): - f.readline() - days = [] - values = [] - for l in f: - fields = l.decode("utf-8") - fields = fields.split(",") - d = datetime.datetime.strptime(fields[0], "%Y-%m-%d") - v = [np.nan if x.strip() == "null" else float(x) for x in fields[1:]] - days.append(d) - values.append(v) - return np.array(days), np.array(values) - - def pad_opening_values(values) -> None: - # fill first value from future if required - first = 0 - while np.isnan(values[first, 0]): - first += 1 - values[0, 0] = values[first, 0] - # iterate over all indices where data is missing - for i in np.where(np.isnan(values[:, 0]))[0]: - j = i - # pad opening value with close value of previous data - while np.isnan(values[j][3]): - j -= 1 - values[i, 0] = values[j, 3] - - data = [] - for index in ["^JKSE", "^N225", "^NDX"]: - fname = f"datasets/{index}.csv" - with pkg_resources.resource_stream(__name__, fname) as f: - days, values = load_finance_yahoo_data(f) - pad_opening_values(values) - data.append((days, values)) - return data + """Loads the following datasets from CSV files in this package: + + - jkse: Jakarta Composite Index, downloaded on 2019-02-12 from https://finance.yahoo.com/quote/%5EJKSE/history?period1=631148400&period2=988668000&interval=1d&filter=history&frequency=1d + - n225: Nikkei 225, downloaded on 2019-02-12 from https://finance.yahoo.com/quote/%5EN225/history?period1=631148400&period2=988668000&interval=1d&filter=history&frequency=1d + - ndx: NASDAQ 100, downloaded on 2019-02-12 from https://finance.yahoo.com/quote/%5ENDX/history?period1=631148400&period2=988668000&interval=1d&filter=history&frequency=1d + + All datasets are daily prices from the period from 1990-01-01 to 2001-05-01 + missing values are NaN except for opening values which are treated as + follows: + + - If the first opening value is missing, the first *existing* opening value + is used for the first day. 
+ - All other missing opening values are filled by the close value of the last + day where data was available. + + Returns: + list of tuple(1d-array, 2d-array): + datasets with days as array of date objects and 2d-array with the columns + "Open", "High", "Low", "Close", "Adj Close", and "Volume". Note that + "Open" values have been padded to ensure that there are no NaNs left. + """ + + def load_finance_yahoo_data(f): + f.readline() + days = [] + values = [] + for l in f: + fields = l.decode("utf-8") + fields = fields.split(",") + d = datetime.datetime.strptime(fields[0], "%Y-%m-%d") + v = [np.nan if x.strip() == "null" else float(x) for x in fields[1:]] + days.append(d) + values.append(v) + return np.array(days), np.array(values) + + def pad_opening_values(values) -> None: + # fill first value from future if required + first = 0 + while np.isnan(values[first, 0]): + first += 1 + values[0, 0] = values[first, 0] + # iterate over all indices where data is missing + for i in np.where(np.isnan(values[:, 0]))[0]: + j = i + # pad opening value with close value of previous data + while np.isnan(values[j][3]): + j -= 1 + values[i, 0] = values[j, 3] + + data = [] + for index in ["^JKSE", "^N225", "^NDX"]: + fname = f"datasets/{index}.csv" + with pkg_resources.resource_stream(__name__, fname) as f: + days, values = load_finance_yahoo_data(f) + pad_opening_values(values) + data.append((days, values)) + return data def barabasi1991_fractal(size, iterations, b1=0.8, b2=0.5): - """Generates the simple fractal described in [bf]_. - - The fractal divides a rectangular segment starting at (x0, y0) with width w - and height h along the x axis into four line segments of equal size with the - boundary points [x0, x1, x2, x3, x4]. It has two parameters b1 and b2 that - allow to choose the value for y(x1) and y(x3) while it always holds that - y(x0) = y0, y(x2) = y0 and y(x4) = y0 + h. - - The process starts with a single line segment of height 1 spanning the whole - data range. 
In each iteration, the rectangles spanning the line segments - from the previous iteration are subdivided according to the same rule. - - References: - .. [bf] A.-L. Barabási and T. Vicsek, “Multifractality of self-affine - fractals,” Physical Review A, vol. 44, no. 4, pp. 2730–2733, 1991. - - Args: - size (int): - number of data points in the resulting array - iterations (int): - number of iterations to perform - - Kwargs: - b1 (float): - relative height at x1 (between 0 and 1) - b2 (float): - relative height at x3 (between 0 and 1) - - Returns: - (1d-array of float): - generated fractal - """ - def b1991(x0, y0, w, h): - if h < 0: - # for a segment with negative slope we have flip the x-axis - d, nxtp = b1991(x0, y0 + h, w, -h) - return d[::-1], nxtp - x1 = x0 + w // 4 - x2 = x0 + w // 2 - x3 = x2 + w // 4 - x4 = x0 + w - data = np.zeros(w, dtype=np.float64) - data[x0 - x0:x1 - x0] = np.linspace(0, 1, x1 - x0) * b1 * h + y0 - data[x1 - x0:x2 - x0] = np.linspace(1, 0, x2 - x1) * b1 * h + y0 - data[x2 - x0:x3 - x0] = np.linspace(0, 1, x3 - x2) * b2 * h + y0 - data[x3 - x0:x4 - x0] = np.linspace(0, 1, x4 - x3) * (1 - b2) * h \ - + y0 + b2 * h - return data, [x0, x1, x2, x3, x4] - fractal = np.linspace(0, 1, size) - intervals = [(0, size)] - for _ in range(iterations): - next_intervals = [] - for x1, x2 in intervals: - d, nxtp = b1991(x1, fractal[x1], x2 - x1, fractal[x2-1] - fractal[x1]) - fractal[x1:x2] = d - next_intervals.extend( - [(np1, np2) for np1, np2 in zip(nxtp[:-1], nxtp[1:])], - ) - intervals = next_intervals - return fractal + """Generates the simple fractal described in [bf]_. + + The fractal divides a rectangular segment starting at (x0, y0) with width w + and height h along the x axis into four line segments of equal size with the + boundary points [x0, x1, x2, x3, x4]. It has two parameters b1 and b2 that + allow to choose the value for y(x1) and y(x3) while it always holds that + y(x0) = y0, y(x2) = y0 and y(x4) = y0 + h. 
+ + The process starts with a single line segment of height 1 spanning the whole + data range. In each iteration, the rectangles spanning the line segments + from the previous iteration are subdivided according to the same rule. + + References: + .. [bf] A.-L. Barabási and T. Vicsek, “Multifractality of self-affine + fractals,” Physical Review A, vol. 44, no. 4, pp. 2730–2733, 1991. + + Args: + size (int): + number of data points in the resulting array + iterations (int): + number of iterations to perform + + Kwargs: + b1 (float): + relative height at x1 (between 0 and 1) + b2 (float): + relative height at x3 (between 0 and 1) + + Returns: + (1d-array of float): + generated fractal + """ + + def b1991(x0, y0, w, h): + if h < 0: + # for a segment with negative slope we have flip the x-axis + d, nxtp = b1991(x0, y0 + h, w, -h) + return d[::-1], nxtp + x1 = x0 + w // 4 + x2 = x0 + w // 2 + x3 = x2 + w // 4 + x4 = x0 + w + data = np.zeros(w, dtype=np.float64) + data[x0 - x0 : x1 - x0] = np.linspace(0, 1, x1 - x0) * b1 * h + y0 + data[x1 - x0 : x2 - x0] = np.linspace(1, 0, x2 - x1) * b1 * h + y0 + data[x2 - x0 : x3 - x0] = np.linspace(0, 1, x3 - x2) * b2 * h + y0 + data[x3 - x0 : x4 - x0] = np.linspace(0, 1, x4 - x3) * (1 - b2) * h + y0 + b2 * h + return data, [x0, x1, x2, x3, x4] + + fractal = np.linspace(0, 1, size) + intervals = [(0, size)] + for _ in range(iterations): + next_intervals = [] + for x1, x2 in intervals: + d, nxtp = b1991(x1, fractal[x1], x2 - x1, fractal[x2 - 1] - fractal[x1]) + fractal[x1:x2] = d + next_intervals.extend( + [(np1, np2) for np1, np2 in zip(nxtp[:-1], nxtp[1:])], + ) + intervals = next_intervals + return fractal brown72 = load_brown72() diff --git a/nolds/examples.py b/nolds/examples.py index 1896269..91e443d 100644 --- a/nolds/examples.py +++ b/nolds/examples.py @@ -5,557 +5,583 @@ def weron_2002_figure2(n=10000) -> None: - """Recreates figure 2 of [w]_ comparing the reported values by Weron to the - values obtained by the functions in 
this package. - - The experiment consists of n iterations where the hurst exponent of randomly - generated gaussian noise is calculated. This is done with differing sequence - lengths of 256, 512, 1024, ...., 65536. The average estimated hurst exponent - over all iterations is plotted for the following configurations: - - * ``weron`` is the Anis-Lloyd-corrected Hurst exponent calculated by Weron - * ``rs50`` is the Anis-Lloyd-corrected Hurst exponent calculated by Nolds - with the same parameters as used by Weron - * ``weron_raw`` is the uncorrected Hurst exponent calculated by Weron - * ``rs50_raw`` is the uncorrected Hurst exponent calculated by Nolds with the - same parameters as used by Weron - * ``rsn`` is the Anis-Lloyd-corrected Hurst exponent calculated by Nolds with - the default settings of Nolds - - The values reported by Weron are only measured from the plot in the PDF - version of the paper and can therefore have some small inaccuracies. - - This function requires the package ``matplotlib``. - - References: - - .. [w] R. Weron, “Estimating long-range dependence: finite sample - properties and confidence intervals,” Physica A: Statistical Mechanics - and its Applications, vol. 312, no. 1, pp. 285–299, 2002. 
- - Kwargs: - n (int): - number of iterations of the experiment (Weron used 10000, but this takes - a while) - """ - # local import to avoid dependency for non-debug use - import matplotlib.pyplot as plt - # note: these values are calculated by measurements in inkscape of the plot - # from the paper - reported = [6.708, 13.103, 20.240, 21.924, 22.256, 24.112, 24.054, 26.299, - 26.897] - reported_raw = [160.599, 141.663, 128.454, 115.617, 103.651, 95.481, 86.810, - 81.799, 76.270] - - def height_to_h(height): - return 0.49 + height / 29.894 * 0.01 - reported = height_to_h(np.array(reported)) - reported_raw = height_to_h(np.array(reported_raw)) - data = [] - for e in range(8, 17): - l = 2**e - nvals = 2**np.arange(6, e) - rsn = np.mean([ - nolds.hurst_rs(np.random.normal(size=l), fit="poly") - for _ in range(n) - ]) - rs50 = np.mean([ - nolds.hurst_rs(np.random.normal(size=l), fit="poly", nvals=nvals) - for _ in range(n) - ]) - rs50_raw = np.mean([ - nolds.hurst_rs( - np.random.normal(size=l), fit="poly", nvals=nvals, corrected=False, - ) - for _ in range(n) - ]) - data.append((rsn, rs50, rs50_raw)) - lines = plt.plot(np.arange(8, 17), data) - r = plt.plot(np.arange(8, 17), reported) - rr = plt.plot(np.arange(8, 17), reported_raw) - plt.legend(r + rr + lines, ("weron", "weron_raw", "rsn", "rs50", "rs50_raw")) - plt.xticks(np.arange(8, 17), 2**np.arange(8, 17)) - plt.xlabel("sequence length") - plt.ylabel("estimated hurst exponent") - plt.show() + """Recreates figure 2 of [w]_ comparing the reported values by Weron to the + values obtained by the functions in this package. + + The experiment consists of n iterations where the hurst exponent of randomly + generated gaussian noise is calculated. This is done with differing sequence + lengths of 256, 512, 1024, ...., 65536. 
The average estimated hurst exponent + over all iterations is plotted for the following configurations: + + * ``weron`` is the Anis-Lloyd-corrected Hurst exponent calculated by Weron + * ``rs50`` is the Anis-Lloyd-corrected Hurst exponent calculated by Nolds + with the same parameters as used by Weron + * ``weron_raw`` is the uncorrected Hurst exponent calculated by Weron + * ``rs50_raw`` is the uncorrected Hurst exponent calculated by Nolds with the + same parameters as used by Weron + * ``rsn`` is the Anis-Lloyd-corrected Hurst exponent calculated by Nolds with + the default settings of Nolds + + The values reported by Weron are only measured from the plot in the PDF + version of the paper and can therefore have some small inaccuracies. + + This function requires the package ``matplotlib``. + + References: + + .. [w] R. Weron, “Estimating long-range dependence: finite sample + properties and confidence intervals,” Physica A: Statistical Mechanics + and its Applications, vol. 312, no. 1, pp. 285–299, 2002. 
+ + Kwargs: + n (int): + number of iterations of the experiment (Weron used 10000, but this takes + a while) + """ + # local import to avoid dependency for non-debug use + import matplotlib.pyplot as plt + + # note: these values are calculated by measurements in inkscape of the plot + # from the paper + reported = [6.708, 13.103, 20.240, 21.924, 22.256, 24.112, 24.054, 26.299, 26.897] + reported_raw = [160.599, 141.663, 128.454, 115.617, 103.651, 95.481, 86.810, 81.799, 76.270] + + def height_to_h(height): + return 0.49 + height / 29.894 * 0.01 + + reported = height_to_h(np.array(reported)) + reported_raw = height_to_h(np.array(reported_raw)) + data = [] + for e in range(8, 17): + l = 2**e + nvals = 2 ** np.arange(6, e) + rsn = np.mean([nolds.hurst_rs(np.random.normal(size=l), fit="poly") for _ in range(n)]) + rs50 = np.mean( + [nolds.hurst_rs(np.random.normal(size=l), fit="poly", nvals=nvals) for _ in range(n)] + ) + rs50_raw = np.mean( + [ + nolds.hurst_rs( + np.random.normal(size=l), + fit="poly", + nvals=nvals, + corrected=False, + ) + for _ in range(n) + ] + ) + data.append((rsn, rs50, rs50_raw)) + lines = plt.plot(np.arange(8, 17), data) + r = plt.plot(np.arange(8, 17), reported) + rr = plt.plot(np.arange(8, 17), reported_raw) + plt.legend(r + rr + lines, ("weron", "weron_raw", "rsn", "rs50", "rs50_raw")) + plt.xticks(np.arange(8, 17), 2 ** np.arange(8, 17)) + plt.xlabel("sequence length") + plt.ylabel("estimated hurst exponent") + plt.show() def plot_hurst_hist() -> None: - """Plots a histogram of values obtained for the hurst exponent of uniformly - distributed white noise. - - This function requires the package ``matplotlib``. 
- """ - # local import to avoid dependency for non-debug use - import matplotlib.pyplot as plt - hs = [ - nolds.hurst_rs(np.random.random(size=10000), corrected=True) - for _ in range(100) - ] - plt.hist(hs, bins=20) - plt.xlabel("esimated value of hurst exponent") - plt.ylabel("number of experiments") - plt.show() + """Plots a histogram of values obtained for the hurst exponent of uniformly + distributed white noise. + + This function requires the package ``matplotlib``. + """ + # local import to avoid dependency for non-debug use + import matplotlib.pyplot as plt + + hs = [nolds.hurst_rs(np.random.random(size=10000), corrected=True) for _ in range(100)] + plt.hist(hs, bins=20) + plt.xlabel("esimated value of hurst exponent") + plt.ylabel("number of experiments") + plt.show() def plot_lyap(maptype="logistic") -> None: - """Plots a bifurcation plot of the given map and superimposes the true - lyapunov exponent as well as the estimates of the largest lyapunov exponent - obtained by ``lyap_r`` and ``lyap_e``. The idea for this plot is taken - from [ll]_. - - This function requires the package ``matplotlib``. - - References: - - .. [ll] Manfred Füllsack, "Lyapunov exponent", - url: http://systems-sciences.uni-graz.at/etextbook/sw2/lyapunov.html - - Kwargs: - maptype (str): - can be either ``"logistic"`` for the logistic map or ``"tent"`` for the - tent map. - """ - # local import to avoid dependency for non-debug use - import matplotlib.pyplot as plt - - x_start = 0.1 - n = 140 - nbifur = 40 - if maptype == "logistic": - param_name = "r" - param_range = np.arange(2, 4, 0.01) - full_data = np.array([ - np.fromiter(datasets.logistic_map(x_start, n, r), dtype="float32") - for r in param_range - ]) - # It can be proven that the lyapunov exponent of the logistic map - # (or any map that is an iterative application of a function) can be - # calculated as the mean of the logarithm of the absolute of the - # derivative at the individual data points. 
- # For a proof see for example: - # https://blog.abhranil.net/2015/05/15/lyapunov-exponent-of-the-logistic-map-mathematica-code/ - # Derivative of logistic map: f(x) = r * x * (1 - x) = r * x - r * x² - # => f'(x) = r - 2 * r * x - lambdas = [ - np.mean(np.log(abs(r - 2 * r * x[np.where(x != 0.5)]))) - for x, r in zip(full_data, param_range) - ] - elif maptype == "tent": - param_name = "$\\mu$" - param_range = np.arange(0, 2, 0.01) - full_data = np.array([ - np.fromiter(datasets.tent_map(x_start, n, mu), dtype="float32") - for mu in param_range - ]) - # for the tent map the lyapunov exponent is much easier to calculate - # since the values are multiplied by mu in each step, two trajectories - # starting in x and x + delta will have a distance of delta * mu^n after n - # steps. Therefore the lyapunov exponent should be log(mu). - lambdas = np.log(param_range, where=param_range > 0) - lambdas[np.where(param_range <= 0)] = np.nan - else: - msg = f"maptype {maptype} not recognized" - raise Error(msg) - - kwargs_e = {"emb_dim": 6, "matrix_dim": 2} - kwargs_r = {"emb_dim": 6, "lag": 2, "min_tsep": 20, "trajectory_len": 20} - lambdas_e = [max(nolds.lyap_e(d, **kwargs_e)) for d in full_data] - lambdas_r = [nolds.lyap_r(d, **kwargs_r) for d in full_data] - bifur_x = np.repeat(param_range, nbifur) - bifur = np.reshape(full_data[:, -nbifur:], nbifur * param_range.shape[0]) - - plt.title(f"Lyapunov exponent of the {maptype} map") - plt.plot(param_range, lambdas, "b-", label="true lyap. exponent") - elab = "estimation using lyap_e" - rlab = "estimation using lyap_r" - plt.plot(param_range, lambdas_e, color="#00AAAA", label=elab) - plt.plot(param_range, lambdas_r, color="#AA00AA", label=rlab) - plt.plot(param_range, np.zeros(len(param_range)), "g--") - plt.plot(bifur_x, bifur, "ro", alpha=0.1, label="bifurcation plot") - plt.ylim((-2, 2)) - plt.xlabel(param_name) - plt.ylabel(f"lyap. 
exp / {maptype}(x, {param_name})") - plt.legend(loc="best") - plt.show() + """Plots a bifurcation plot of the given map and superimposes the true + lyapunov exponent as well as the estimates of the largest lyapunov exponent + obtained by ``lyap_r`` and ``lyap_e``. The idea for this plot is taken + from [ll]_. + + This function requires the package ``matplotlib``. + + References: + + .. [ll] Manfred Füllsack, "Lyapunov exponent", + url: http://systems-sciences.uni-graz.at/etextbook/sw2/lyapunov.html + + Kwargs: + maptype (str): + can be either ``"logistic"`` for the logistic map or ``"tent"`` for the + tent map. + """ + # local import to avoid dependency for non-debug use + import matplotlib.pyplot as plt + + x_start = 0.1 + n = 140 + nbifur = 40 + if maptype == "logistic": + param_name = "r" + param_range = np.arange(2, 4, 0.01) + full_data = np.array( + [ + np.fromiter(datasets.logistic_map(x_start, n, r), dtype="float32") + for r in param_range + ] + ) + # It can be proven that the lyapunov exponent of the logistic map + # (or any map that is an iterative application of a function) can be + # calculated as the mean of the logarithm of the absolute of the + # derivative at the individual data points. 
+ # For a proof see for example: + # https://blog.abhranil.net/2015/05/15/lyapunov-exponent-of-the-logistic-map-mathematica-code/ + # Derivative of logistic map: f(x) = r * x * (1 - x) = r * x - r * x² + # => f'(x) = r - 2 * r * x + lambdas = [ + np.mean(np.log(abs(r - 2 * r * x[np.where(x != 0.5)]))) + for x, r in zip(full_data, param_range) + ] + elif maptype == "tent": + param_name = "$\\mu$" + param_range = np.arange(0, 2, 0.01) + full_data = np.array( + [np.fromiter(datasets.tent_map(x_start, n, mu), dtype="float32") for mu in param_range] + ) + # for the tent map the lyapunov exponent is much easier to calculate + # since the values are multiplied by mu in each step, two trajectories + # starting in x and x + delta will have a distance of delta * mu^n after n + # steps. Therefore the lyapunov exponent should be log(mu). + lambdas = np.log(param_range, where=param_range > 0) + lambdas[np.where(param_range <= 0)] = np.nan + else: + msg = f"maptype {maptype} not recognized" + raise Error(msg) + + kwargs_e = {"emb_dim": 6, "matrix_dim": 2} + kwargs_r = {"emb_dim": 6, "lag": 2, "min_tsep": 20, "trajectory_len": 20} + lambdas_e = [max(nolds.lyap_e(d, **kwargs_e)) for d in full_data] + lambdas_r = [nolds.lyap_r(d, **kwargs_r) for d in full_data] + bifur_x = np.repeat(param_range, nbifur) + bifur = np.reshape(full_data[:, -nbifur:], nbifur * param_range.shape[0]) + + plt.title(f"Lyapunov exponent of the {maptype} map") + plt.plot(param_range, lambdas, "b-", label="true lyap. exponent") + elab = "estimation using lyap_e" + rlab = "estimation using lyap_r" + plt.plot(param_range, lambdas_e, color="#00AAAA", label=elab) + plt.plot(param_range, lambdas_r, color="#AA00AA", label=rlab) + plt.plot(param_range, np.zeros(len(param_range)), "g--") + plt.plot(bifur_x, bifur, "ro", alpha=0.1, label="bifurcation plot") + plt.ylim((-2, 2)) + plt.xlabel(param_name) + plt.ylabel(f"lyap. 
exp / {maptype}(x, {param_name})") + plt.legend(loc="best") + plt.show() def profiling() -> None: - """Runs a profiling test for the function ``lyap_e`` (mainly used for - development). + """Runs a profiling test for the function ``lyap_e`` (mainly used for + development). - This function requires the package ``cProfile``. - """ - import cProfile - n = 10000 - data = np.cumsum(np.random.random(n) - 0.5) - cProfile.runctx("lyap_e(data)", {"lyap_e": nolds.lyap_e}, {"data": data}) + This function requires the package ``cProfile``. + """ + import cProfile + + n = 10000 + data = np.cumsum(np.random.random(n) - 0.5) + cProfile.runctx("lyap_e(data)", {"lyap_e": nolds.lyap_e}, {"data": data}) def hurst_compare_nvals(data, nvals=None) -> None: - """Creates a plot that compares the results of different choices for nvals - for the function hurst_rs. - - Args: - data (array-like of float): - the input data from which the hurst exponent should be estimated - - Kwargs: - nvals (array of int): - a manually selected value for the nvals parameter that should be plotted - in comparison to the default choices - """ - import matplotlib.pyplot as plt - data = np.asarray(data) - n_all = np.arange(2, len(data)+1) - dd_all = nolds.hurst_rs(data, nvals=n_all, debug_data=True, fit="poly") - dd_def = nolds.hurst_rs(data, debug_data=True, fit="poly") - n_def = np.round(np.exp(dd_def[1][0])).astype("int32") - n_div = n_all[np.where(len(data) % n_all[:-1] == 0)] - dd_div = nolds.hurst_rs(data, nvals=n_div, debug_data=True, fit="poly") - - def corr(nvals): - return [np.log(nolds.expected_rs(n)) for n in nvals] - - l_all = plt.plot(dd_all[1][0], dd_all[1][1] - corr(n_all), "o") - l_def = plt.plot(dd_def[1][0], dd_def[1][1] - corr(n_def), "o") - l_div = plt.plot(dd_div[1][0], dd_div[1][1] - corr(n_div), "o") - l_cst = [] - t_cst = [] - - if nvals is not None: - dd_cst = nolds.hurst_rs(data, nvals=nvals, debug_data=True, fit="poly") - l_cst = plt.plot(dd_cst[1][0], dd_cst[1][1] - corr(nvals), "o") 
- t_cst = ["custom"] - plt.xlabel("log(n)") - plt.ylabel("log((R/S)_n - E[(R/S)_n])") - plt.legend( - l_all + l_def + l_div + l_cst, ["all", "default", "divisors", *t_cst], - ) - labeled_data = zip([dd_all[0], dd_def[0], dd_div[0]], ["all", "def", "div"]) - for data, _label in labeled_data: - pass - if nvals is not None: - pass - plt.show() + """Creates a plot that compares the results of different choices for nvals + for the function hurst_rs. + + Args: + data (array-like of float): + the input data from which the hurst exponent should be estimated + + Kwargs: + nvals (array of int): + a manually selected value for the nvals parameter that should be plotted + in comparison to the default choices + """ + import matplotlib.pyplot as plt + + data = np.asarray(data) + n_all = np.arange(2, len(data) + 1) + dd_all = nolds.hurst_rs(data, nvals=n_all, debug_data=True, fit="poly") + dd_def = nolds.hurst_rs(data, debug_data=True, fit="poly") + n_def = np.round(np.exp(dd_def[1][0])).astype("int32") + n_div = n_all[np.where(len(data) % n_all[:-1] == 0)] + dd_div = nolds.hurst_rs(data, nvals=n_div, debug_data=True, fit="poly") + + def corr(nvals): + return [np.log(nolds.expected_rs(n)) for n in nvals] + + l_all = plt.plot(dd_all[1][0], dd_all[1][1] - corr(n_all), "o") + l_def = plt.plot(dd_def[1][0], dd_def[1][1] - corr(n_def), "o") + l_div = plt.plot(dd_div[1][0], dd_div[1][1] - corr(n_div), "o") + l_cst = [] + t_cst = [] + + if nvals is not None: + dd_cst = nolds.hurst_rs(data, nvals=nvals, debug_data=True, fit="poly") + l_cst = plt.plot(dd_cst[1][0], dd_cst[1][1] - corr(nvals), "o") + t_cst = ["custom"] + plt.xlabel("log(n)") + plt.ylabel("log((R/S)_n - E[(R/S)_n])") + plt.legend( + l_all + l_def + l_div + l_cst, + ["all", "default", "divisors", *t_cst], + ) + labeled_data = zip([dd_all[0], dd_def[0], dd_div[0]], ["all", "def", "div"]) + for data, _label in labeled_data: + pass + if nvals is not None: + pass + plt.show() + def sampen_default_tolerance() -> None: - data = 
list(datasets.logistic_map(0.34, 1000, r=3.9)) - oldtol = 0.2 * np.std(data, ddof=1) - old_res = [ - nolds.sampen(data, emb_dim=i, tolerance=oldtol) - for i in range(1, 30) - ] - new_res = [ - nolds.sampen(data, emb_dim=i) - for i in range(1, 30) - ] - for _i, _old, _new in zip(range(1, 30), old_res, new_res): - pass + data = list(datasets.logistic_map(0.34, 1000, r=3.9)) + oldtol = 0.2 * np.std(data, ddof=1) + old_res = [nolds.sampen(data, emb_dim=i, tolerance=oldtol) for i in range(1, 30)] + new_res = [nolds.sampen(data, emb_dim=i) for i in range(1, 30)] + for _i, _old, _new in zip(range(1, 30), old_res, new_res): + pass + def aste_line_fitting(N=100) -> None: - """Shows plot that proves that the line fitting in T. Astes original MATLAB code - provides the same results as `np.polyfit`. - """ - slope = np.random.random() * 10 - 5 - intercept = np.random.random() * 100 - 50 - xvals = np.arange(N) - yvals = xvals * slope + intercept + np.random.randn(N)*100 - import matplotlib.pyplot as plt - plt.plot(xvals, yvals, "rx", label="data") - plt.plot( - [0, N-1], [intercept, intercept + slope * (N-1)], - "r-", label=f"true ({slope:.3f} x + {intercept:.3f})", alpha=0.5, - ) - i_aste, s_aste = nolds._aste_line_fit(xvals, yvals) - s_np, i_np = np.polyfit(xvals, yvals, 1) - plt.plot( - [0, N-1], [i_aste, i_aste + s_aste * (N-1)], - "b-", label=f"aste ({s_aste:.3f} x + {i_aste:.3f})", alpha=0.5, - ) - plt.plot( - [0, N-1], [i_np, i_np + s_np * (N-1)], - "g-", label=f"numpy ({s_np:.3f} x + {i_np:.3f})", alpha=0.5, - ) - plt.legend() - plt.show() + """Shows plot that proves that the line fitting in T. Astes original MATLAB code + provides the same results as `np.polyfit`. 
+ """ + slope = np.random.random() * 10 - 5 + intercept = np.random.random() * 100 - 50 + xvals = np.arange(N) + yvals = xvals * slope + intercept + np.random.randn(N) * 100 + import matplotlib.pyplot as plt + + plt.plot(xvals, yvals, "rx", label="data") + plt.plot( + [0, N - 1], + [intercept, intercept + slope * (N - 1)], + "r-", + label=f"true ({slope:.3f} x + {intercept:.3f})", + alpha=0.5, + ) + i_aste, s_aste = nolds._aste_line_fit(xvals, yvals) + s_np, i_np = np.polyfit(xvals, yvals, 1) + plt.plot( + [0, N - 1], + [i_aste, i_aste + s_aste * (N - 1)], + "b-", + label=f"aste ({s_aste:.3f} x + {i_aste:.3f})", + alpha=0.5, + ) + plt.plot( + [0, N - 1], + [i_np, i_np + s_np * (N - 1)], + "g-", + label=f"numpy ({s_np:.3f} x + {i_np:.3f})", + alpha=0.5, + ) + plt.legend() + plt.show() def hurst_mf_stock(debug=False) -> None: - """Recreates results from [mfs_1]_ (table at start of section 4) as print - output. - - Unfortunately as a layman in finance, I could not determine the exact data - that Di Matteo et al. used. Instead I use the data from - `nolds.datasets.load_financial()`. - - Plots H(2) for the following datasets and algorithms. - - Datasets (opening values from `load_financial()`): - - - jkse: Jakarta Composite Index - - n225: Nikkei 225 - - ndx: NASDAQ 100 - - Algorithms: - - - mfhurst_b: GHE according to Barabási et al. - - mfhurst_b + dt: like mfhurst_b, but with linear detrending performed first - - mfhurst_dm: GHE according to Di Matteo et al. (should be identical to - _genhurst) - - _genhurst: GHE according to translated MATLAB code by T. Aste (one of the - co-authors of Di Matteo). - - References: - - .. [mfs_1] T. Di Matteo, T. Aste, and M. M. Dacorogna, “Scaling behaviors - in differently developed markets,” Physica A: Statistical Mechanics - and its Applications, vol. 324, no. 1–2, pp. 183–188, 2003. - - Kwargs: - debug (boolean): - if `True`, a debug plot will be shown for each calculated GHE value - except for the ones generated by `_genhurst`. 
- """ - financial = [ - (datasets.jkse, "jkse"), (datasets.n225, "n225"), (datasets.ndx, "ndx"), - ] - for data, _lab in financial: - data = data[1][:, 0] - data = np.log(data) - dists = range(1, 20) - nolds.mfhurst_b(data, qvals=[2], dists=dists, debug_plot=debug)[0] - nolds.mfhurst_b( - nolds.detrend_data(data, order=1), - qvals=[2], dists=dists, debug_plot=debug, - )[0] - nolds.mfhurst_dm(data, qvals=[2], debug_plot=debug)[0][0] - nolds._genhurst(data, 2) + """Recreates results from [mfs_1]_ (table at start of section 4) as print + output. + + Unfortunately as a layman in finance, I could not determine the exact data + that Di Matteo et al. used. Instead I use the data from + `nolds.datasets.load_financial()`. + + Plots H(2) for the following datasets and algorithms. + + Datasets (opening values from `load_financial()`): + + - jkse: Jakarta Composite Index + - n225: Nikkei 225 + - ndx: NASDAQ 100 + + Algorithms: + + - mfhurst_b: GHE according to Barabási et al. + - mfhurst_b + dt: like mfhurst_b, but with linear detrending performed first + - mfhurst_dm: GHE according to Di Matteo et al. (should be identical to + _genhurst) + - _genhurst: GHE according to translated MATLAB code by T. Aste (one of the + co-authors of Di Matteo). + + References: + + .. [mfs_1] T. Di Matteo, T. Aste, and M. M. Dacorogna, “Scaling behaviors + in differently developed markets,” Physica A: Statistical Mechanics + and its Applications, vol. 324, no. 1–2, pp. 183–188, 2003. + + Kwargs: + debug (boolean): + if `True`, a debug plot will be shown for each calculated GHE value + except for the ones generated by `_genhurst`. 
+ """ + financial = [ + (datasets.jkse, "jkse"), + (datasets.n225, "n225"), + (datasets.ndx, "ndx"), + ] + for data, _lab in financial: + data = data[1][:, 0] + data = np.log(data) + dists = range(1, 20) + nolds.mfhurst_b(data, qvals=[2], dists=dists, debug_plot=debug)[0] + nolds.mfhurst_b( + nolds.detrend_data(data, order=1), + qvals=[2], + dists=dists, + debug_plot=debug, + )[0] + nolds.mfhurst_dm(data, qvals=[2], debug_plot=debug)[0][0] + nolds._genhurst(data, 2) def barabasi_1991_figure2() -> None: - """Recreates figure 2 from [bf2]_. - - This figure compares calculated and estimated values for H(q) for - a fractal generated by 9 iterations of the `barabasi1991_fractal` function - with b1 = 0.8 and b2 = 0.5. - - References: - .. [bf2] A.-L. Barabási and T. Vicsek, “Multifractality of self-affine - fractals,” Physical Review A, vol. 44, no. 4, pp. 2730–2733, 1991. - """ - import matplotlib.pyplot as plt - b1991 = datasets.barabasi1991_fractal(10000000, 9) - qvals = range(1, 11) - qvals_t = range(-10, 11) - b1 = 0.8 - b2 = 0.5 - dists = [4 ** i for i in range(6, 11)] - # dists = nolds.logarithmic_n(100, 0.01 * len(b1991), 2) - Hq = nolds.mfhurst_b(b1991, qvals=qvals, dists=dists) - Hq_t = [np.log((b1 ** q + b2 ** q) / 2) / np.log(0.25) / q for q in qvals_t] - plt.plot(qvals, Hq, "r+", label="mfhurst_b") - plt.plot(qvals_t, Hq_t, label="calculated value") - plt.legend(loc="best") - plt.xlabel("q") - plt.ylabel("H(q)") - plt.show() + """Recreates figure 2 from [bf2]_. + + This figure compares calculated and estimated values for H(q) for + a fractal generated by 9 iterations of the `barabasi1991_fractal` function + with b1 = 0.8 and b2 = 0.5. + + References: + .. [bf2] A.-L. Barabási and T. Vicsek, “Multifractality of self-affine + fractals,” Physical Review A, vol. 44, no. 4, pp. 2730–2733, 1991. 
+ """ + import matplotlib.pyplot as plt + + b1991 = datasets.barabasi1991_fractal(10000000, 9) + qvals = range(1, 11) + qvals_t = range(-10, 11) + b1 = 0.8 + b2 = 0.5 + dists = [4**i for i in range(6, 11)] + # dists = nolds.logarithmic_n(100, 0.01 * len(b1991), 2) + Hq = nolds.mfhurst_b(b1991, qvals=qvals, dists=dists) + Hq_t = [np.log((b1**q + b2**q) / 2) / np.log(0.25) / q for q in qvals_t] + plt.plot(qvals, Hq, "r+", label="mfhurst_b") + plt.plot(qvals_t, Hq_t, label="calculated value") + plt.legend(loc="best") + plt.xlabel("q") + plt.ylabel("H(q)") + plt.show() def barabasi_1991_figure3() -> None: - """Recreates figure 3 from [bf3]_. - - This figure compares calculated and estimated values for H(q) for a simple - Brownian motion that moves in unit steps (-1 or +1) in each time step. - - References: - .. [bf3] A.-L. Barabási and T. Vicsek, “Multifractality of self-affine - fractals,” Physical Review A, vol. 44, no. 4, pp. 2730–2733, 1991. - """ - import matplotlib.pyplot as plt - brown = np.cumsum(np.random.randint(0, 2, size=10000000)*2-1) - qvals = [-5, -4, -3, -2, -1.1, 0.1, 1, 2, 3, 4, 5] - Hq_t = [0.5 if q > -1 else -0.5/q for q in qvals] - dists = [2 ** i for i in range(6, 15)] - # dists = nolds.logarithmic_n(100, 0.01 * len(brown), 1.5) - Hq = nolds.mfhurst_b(brown, qvals=qvals, dists=dists, debug_plot=False) - plt.plot(qvals, Hq, "r+", label="mfhurst_b") - plt.plot(qvals, Hq_t, label="calculated value") - plt.ylim(0, 1) - plt.legend(loc="best") - plt.xlabel("q") - plt.ylabel("H(q)") - plt.show() + """Recreates figure 3 from [bf3]_. + + This figure compares calculated and estimated values for H(q) for a simple + Brownian motion that moves in unit steps (-1 or +1) in each time step. + + References: + .. [bf3] A.-L. Barabási and T. Vicsek, “Multifractality of self-affine + fractals,” Physical Review A, vol. 44, no. 4, pp. 2730–2733, 1991. 
+ """ + import matplotlib.pyplot as plt + + brown = np.cumsum(np.random.randint(0, 2, size=10000000) * 2 - 1) + qvals = [-5, -4, -3, -2, -1.1, 0.1, 1, 2, 3, 4, 5] + Hq_t = [0.5 if q > -1 else -0.5 / q for q in qvals] + dists = [2**i for i in range(6, 15)] + # dists = nolds.logarithmic_n(100, 0.01 * len(brown), 1.5) + Hq = nolds.mfhurst_b(brown, qvals=qvals, dists=dists, debug_plot=False) + plt.plot(qvals, Hq, "r+", label="mfhurst_b") + plt.plot(qvals, Hq_t, label="calculated value") + plt.ylim(0, 1) + plt.legend(loc="best") + plt.xlabel("q") + plt.ylabel("H(q)") + plt.show() def lorenz() -> None: - """Calculates different measures for the Lorenz system of ordinary - differential equations and compares nolds results with prescribed - results from the literature. - - The Lorenz system is a three dimensional dynamical system given - by the following equations: - - dx/dt = sigma * (y - x) - dy/dt = rho * x - y - x * z - dz/dt = x * y - beta * z - - To test the reconstruction of higher-dimensional phenomena from - one-dimensional data, the lorenz system is simulated with a - simple Euler method and then the x-, y-, and z-values are used - as one-dimensional input for the nolds algorithms. - - Parameters for Lorenz system: - - - sigma = 10 - - rho = 28 - - beta = 8/3 - - dt = 0.012 - - Algorithms: - - - ``lyap_r`` with min_tsep=1000, emb_dim=5, tau=0.01, and lag=5 (see [l_4]_) - - ``lyap_e`` with min_tsep=1000, emb_dim=5, matrix_dim=5, and tau=0.01 (see [l_4]_) - - ``corr_dim`` with emb_dim=10, and fit=poly (see [l_1]_) - - ``hurst_rs`` with fit=poly (see [l_3]_) - - ``dfa`` with default parameters (see [l_5]_) - - ``sampen`` with default parameters (see [l_2]_) - - References: - - .. [l_1] P. Grassberger and I. Procaccia, “Measuring the strangeness - of strange attractors,” Physica D: Nonlinear Phenomena, vol. 9, - no. 1, pp. 189–208, 1983. - .. [l_2] F. Kaffashi, R. Foglyano, C. G. Wilson, and K. A. 
Loparo, - “The effect of time delay on Approximate & Sample Entropy - calculations,” Physica D: Nonlinear Phenomena, vol. 237, no. 23, - pp. 3069–3074, 2008, doi: 10.1016/j.physd.2008.06.005. - .. [l_3] V. Suyal, A. Prasad, and H. P. Singh, “Nonlinear Time Series - Analysis of Sunspot Data,” Sol Phys, vol. 260, no. 2, pp. 441–449, - 2009, doi: 10.1007/s11207-009-9467-x. - .. [l_4] G. A. Leonov and N. V. Kuznetsov, “On differences and - similarities in the analysis of Lorenz, Chen, and Lu systems,” - Applied Mathematics and Computation, vol. 256, pp. 334–343, 2015, - doi: 10.1016/j.amc.2014.12.132. - .. [l_5] S. Wallot, J. P. Irmer, M. Tschense, N. Kuznetsov, A. Højlund, - and M. Dietz, “A Multivariate Method for Dynamic System Analysis: - Multivariate Detrended Fluctuation Analysis Using Generalized Variance,” - Topics in Cognitive Science, p. tops.12688, Sep. 2023, - doi: 10.1111/tops.12688. - - - """ - sigma = 10 - rho = 28 - beta = 8.0/3 - start = [0, 22, 10] - n = 10000 - skip = 10000 - dt = 0.012 - data = datasets.lorenz_euler(n + skip, sigma, rho, beta, start=start, dt=dt)[skip:] - - # fig = plt.figure() - # ax = fig.add_subplot(111, projection="3d") - # ax.plot(data[:, 0], data[:, 1], data[:, 2]) - # plt.show() - # plt.close(fig) - - datasets.lorenz_lyap(sigma, rho, beta) - # Rationale for argument values: - # start with medium settings for min_tsep and lag, span a large area with trajectory_len, set fit_offset to 0 - # up the embedding dimension until you get a clear line in the debug plot - # adjust trajectory_len and fit_offset to split off only the linear part - # in general: the longer the linear part of the plot, the better - lyap_r_args = {"min_tsep": 10, "emb_dim": 5, "tau": dt, "lag": 5, "trajectory_len": 28, "fit_offset": 8, "fit": "poly"} - nolds.lyap_r(data[:, 0], **lyap_r_args) - nolds.lyap_r(data[:, 1], **lyap_r_args) - nolds.lyap_r(data[:, 2], **lyap_r_args) - # Rationale for argument values: - # Start with emb_dim=matrix_dim, medium min_tsep 
and min_nb - # After that, no good guidelines for stability. :( - # -> Just experiment with settings until you get close to expected value. ¯\_(ツ)_/¯ - # NOTE: It seems from this example and `lyapunov-logistic` that lyap_e has a scaling problem. - lyap_e_args = {"min_tsep": 10, "emb_dim": 5, "matrix_dim": 5, "tau": dt, "min_nb": 8} - nolds.lyap_e(data[:, 0], **lyap_e_args) - nolds.lyap_e(data[:, 1], **lyap_e_args) - nolds.lyap_e(data[:, 2], **lyap_e_args) - - # Rationale for argument values: - # Start with moderate settings for lag and a large span of rvals. - # Increase emb_dim until you get a clear line in the debug plot - # Clip rvals to select only the linear part of the plot. - # Increase lag as long as it increases the output. Stop when the output becomes smaller - # (or when you feel that the lag is unreasonably large.) - rvals = nolds.logarithmic_r(1, np.e, 1.1) # determined experimentally - corr_dim_args = {"emb_dim": 5, "lag": 10, "fit": "poly", "rvals": rvals} - nolds.corr_dim(data[:, 0], **corr_dim_args) - nolds.corr_dim(data[:, 1], **corr_dim_args) - nolds.corr_dim(data[:, 2], **corr_dim_args) - # reference Grassberger-Procaccia 1983 - - # Rationale for argument values: - # Start with a large range of nvals. - # Reduce those down cutting of the first few data points and then only keep the - # linear-ish looking part of the initial rise. 
- hurst_rs_args = {"fit": "poly", "nvals": nolds.logarithmic_n(10, 70, 1.1)} - nolds.hurst_rs(data[:, 0], **hurst_rs_args) - nolds.hurst_rs(data[:, 1], **hurst_rs_args) - nolds.hurst_rs(data[:, 2], **hurst_rs_args) - # reference: Suyal 2009 - - # reference: Wallot 2023, Table 1 - # Rationale for argument values: Just follow paper - # NOTE since DFA is quite fast and Wallot 2023 use different initial values - # (x = y = z = 0.1 + e) and size of data (100k data points, 1000 runs) and - # don't report step size, we use different data here - data_dfa = datasets.lorenz_euler(120000, 10, 28, 8/3.0, start=[0.1,0.1,0.1], dt=0.002)[20000:] - nvals = nolds.logarithmic_n(200, len(data_dfa)/8, 2**0.2) - dfa_args = {"nvals": nvals, "order": 2, "overlap": False, "fit_exp": "poly"} - nolds.dfa(data_dfa[:, 0], **dfa_args) - nolds.dfa(data_dfa[:, 1], **dfa_args) - nolds.dfa(data_dfa[:, 2], **dfa_args) - - # reference: Kaffashi 2008 - # Rationale for argument values: Just follow paper. - sampen_args = {"emb_dim": 2, "lag": 1} - nolds.sampen(data[:, 0], **sampen_args) - nolds.sampen(data[:, 1], **sampen_args) - nolds.sampen(data[:, 2], **sampen_args) + """Calculates different measures for the Lorenz system of ordinary + differential equations and compares nolds results with prescribed + results from the literature. + + The Lorenz system is a three dimensional dynamical system given + by the following equations: + + dx/dt = sigma * (y - x) + dy/dt = rho * x - y - x * z + dz/dt = x * y - beta * z + + To test the reconstruction of higher-dimensional phenomena from + one-dimensional data, the lorenz system is simulated with a + simple Euler method and then the x-, y-, and z-values are used + as one-dimensional input for the nolds algorithms. 
+ + Parameters for Lorenz system: + + - sigma = 10 + - rho = 28 + - beta = 8/3 + - dt = 0.012 + + Algorithms: + + - ``lyap_r`` with min_tsep=1000, emb_dim=5, tau=0.01, and lag=5 (see [l_4]_) + - ``lyap_e`` with min_tsep=1000, emb_dim=5, matrix_dim=5, and tau=0.01 (see [l_4]_) + - ``corr_dim`` with emb_dim=10, and fit=poly (see [l_1]_) + - ``hurst_rs`` with fit=poly (see [l_3]_) + - ``dfa`` with default parameters (see [l_5]_) + - ``sampen`` with default parameters (see [l_2]_) + + References: + + .. [l_1] P. Grassberger and I. Procaccia, “Measuring the strangeness + of strange attractors,” Physica D: Nonlinear Phenomena, vol. 9, + no. 1, pp. 189–208, 1983. + .. [l_2] F. Kaffashi, R. Foglyano, C. G. Wilson, and K. A. Loparo, + “The effect of time delay on Approximate & Sample Entropy + calculations,” Physica D: Nonlinear Phenomena, vol. 237, no. 23, + pp. 3069–3074, 2008, doi: 10.1016/j.physd.2008.06.005. + .. [l_3] V. Suyal, A. Prasad, and H. P. Singh, “Nonlinear Time Series + Analysis of Sunspot Data,” Sol Phys, vol. 260, no. 2, pp. 441–449, + 2009, doi: 10.1007/s11207-009-9467-x. + .. [l_4] G. A. Leonov and N. V. Kuznetsov, “On differences and + similarities in the analysis of Lorenz, Chen, and Lu systems,” + Applied Mathematics and Computation, vol. 256, pp. 334–343, 2015, + doi: 10.1016/j.amc.2014.12.132. + .. [l_5] S. Wallot, J. P. Irmer, M. Tschense, N. Kuznetsov, A. Højlund, + and M. Dietz, “A Multivariate Method for Dynamic System Analysis: + Multivariate Detrended Fluctuation Analysis Using Generalized Variance,” + Topics in Cognitive Science, p. tops.12688, Sep. 2023, + doi: 10.1111/tops.12688. 
+ + + """ + sigma = 10 + rho = 28 + beta = 8.0 / 3 + start = [0, 22, 10] + n = 10000 + skip = 10000 + dt = 0.012 + data = datasets.lorenz_euler(n + skip, sigma, rho, beta, start=start, dt=dt)[skip:] + + # fig = plt.figure() + # ax = fig.add_subplot(111, projection="3d") + # ax.plot(data[:, 0], data[:, 1], data[:, 2]) + # plt.show() + # plt.close(fig) + + datasets.lorenz_lyap(sigma, rho, beta) + # Rationale for argument values: + # start with medium settings for min_tsep and lag, span a large area with trajectory_len, set fit_offset to 0 + # up the embedding dimension until you get a clear line in the debug plot + # adjust trajectory_len and fit_offset to split off only the linear part + # in general: the longer the linear part of the plot, the better + lyap_r_args = { + "min_tsep": 10, + "emb_dim": 5, + "tau": dt, + "lag": 5, + "trajectory_len": 28, + "fit_offset": 8, + "fit": "poly", + } + nolds.lyap_r(data[:, 0], **lyap_r_args) + nolds.lyap_r(data[:, 1], **lyap_r_args) + nolds.lyap_r(data[:, 2], **lyap_r_args) + # Rationale for argument values: + # Start with emb_dim=matrix_dim, medium min_tsep and min_nb + # After that, no good guidelines for stability. :( + # -> Just experiment with settings until you get close to expected value. ¯\_(ツ)_/¯ + # NOTE: It seems from this example and `lyapunov-logistic` that lyap_e has a scaling problem. + lyap_e_args = {"min_tsep": 10, "emb_dim": 5, "matrix_dim": 5, "tau": dt, "min_nb": 8} + nolds.lyap_e(data[:, 0], **lyap_e_args) + nolds.lyap_e(data[:, 1], **lyap_e_args) + nolds.lyap_e(data[:, 2], **lyap_e_args) + + # Rationale for argument values: + # Start with moderate settings for lag and a large span of rvals. + # Increase emb_dim until you get a clear line in the debug plot + # Clip rvals to select only the linear part of the plot. + # Increase lag as long as it increases the output. Stop when the output becomes smaller + # (or when you feel that the lag is unreasonably large.) 
+ rvals = nolds.logarithmic_r(1, np.e, 1.1) # determined experimentally + corr_dim_args = {"emb_dim": 5, "lag": 10, "fit": "poly", "rvals": rvals} + nolds.corr_dim(data[:, 0], **corr_dim_args) + nolds.corr_dim(data[:, 1], **corr_dim_args) + nolds.corr_dim(data[:, 2], **corr_dim_args) + # reference Grassberger-Procaccia 1983 + + # Rationale for argument values: + # Start with a large range of nvals. + # Reduce those down cutting of the first few data points and then only keep the + # linear-ish looking part of the initial rise. + hurst_rs_args = {"fit": "poly", "nvals": nolds.logarithmic_n(10, 70, 1.1)} + nolds.hurst_rs(data[:, 0], **hurst_rs_args) + nolds.hurst_rs(data[:, 1], **hurst_rs_args) + nolds.hurst_rs(data[:, 2], **hurst_rs_args) + # reference: Suyal 2009 + + # reference: Wallot 2023, Table 1 + # Rationale for argument values: Just follow paper + # NOTE since DFA is quite fast and Wallot 2023 use different initial values + # (x = y = z = 0.1 + e) and size of data (100k data points, 1000 runs) and + # don't report step size, we use different data here + data_dfa = datasets.lorenz_euler(120000, 10, 28, 8 / 3.0, start=[0.1, 0.1, 0.1], dt=0.002)[ + 20000: + ] + nvals = nolds.logarithmic_n(200, len(data_dfa) / 8, 2**0.2) + dfa_args = {"nvals": nvals, "order": 2, "overlap": False, "fit_exp": "poly"} + nolds.dfa(data_dfa[:, 0], **dfa_args) + nolds.dfa(data_dfa[:, 1], **dfa_args) + nolds.dfa(data_dfa[:, 2], **dfa_args) + + # reference: Kaffashi 2008 + # Rationale for argument values: Just follow paper. 
+ sampen_args = {"emb_dim": 2, "lag": 1} + nolds.sampen(data[:, 0], **sampen_args) + nolds.sampen(data[:, 1], **sampen_args) + nolds.sampen(data[:, 2], **sampen_args) if __name__ == "__main__": - # run this with the following command: - # python -m nolds.examples lyapunov-logistic - import sys - - def print_options() -> None: - pass - if len(sys.argv) < 2: - print_options() - elif sys.argv[1] == "lyapunov-logistic": - plot_lyap() - elif sys.argv[1] == "lyapunov-tent": - plot_lyap("tent") - elif sys.argv[1] == "profiling": - profiling() - elif sys.argv[1] == "hurst-weron2": - n = 1000 if len(sys.argv) < 3 else int(sys.argv[2]) - weron_2002_figure2(n) - elif sys.argv[1] == "hurst-hist": - plot_hurst_hist() - elif sys.argv[1] == "hurst-nvals": - hurst_compare_nvals(datasets.brown72) - elif sys.argv[1] == "sampen-tol": - sampen_default_tolerance() - elif sys.argv[1] == "aste-line": - aste_line_fitting() - elif sys.argv[1] == "hurst-mf-stock": - hurst_mf_stock() - elif sys.argv[1] == "hurst-mf-barabasi2": - barabasi_1991_figure2() - elif sys.argv[1] == "hurst-mf-barabasi3": - barabasi_1991_figure3() - elif sys.argv[1] == "lorenz": - lorenz() - else: - print_options() + # run this with the following command: + # python -m nolds.examples lyapunov-logistic + import sys + + def print_options() -> None: + pass + + if len(sys.argv) < 2: + print_options() + elif sys.argv[1] == "lyapunov-logistic": + plot_lyap() + elif sys.argv[1] == "lyapunov-tent": + plot_lyap("tent") + elif sys.argv[1] == "profiling": + profiling() + elif sys.argv[1] == "hurst-weron2": + n = 1000 if len(sys.argv) < 3 else int(sys.argv[2]) + weron_2002_figure2(n) + elif sys.argv[1] == "hurst-hist": + plot_hurst_hist() + elif sys.argv[1] == "hurst-nvals": + hurst_compare_nvals(datasets.brown72) + elif sys.argv[1] == "sampen-tol": + sampen_default_tolerance() + elif sys.argv[1] == "aste-line": + aste_line_fitting() + elif sys.argv[1] == "hurst-mf-stock": + hurst_mf_stock() + elif sys.argv[1] == 
"hurst-mf-barabasi2": + barabasi_1991_figure2() + elif sys.argv[1] == "hurst-mf-barabasi3": + barabasi_1991_figure3() + elif sys.argv[1] == "lorenz": + lorenz() + else: + print_options() diff --git a/nolds/measures.py b/nolds/measures.py index 52ce1f7..417aecf 100644 --- a/nolds/measures.py +++ b/nolds/measures.py @@ -5,1529 +5,1613 @@ def rowwise_chebyshev(x, y): - return np.max(np.abs(x - y), axis=1) + return np.max(np.abs(x - y), axis=1) def rowwise_euclidean(x, y): - return np.sqrt(np.sum((x - y)**2, axis=1)) + return np.sqrt(np.sum((x - y) ** 2, axis=1)) def poly_fit(x, y, degree, fit="RANSAC"): - # check if we can use RANSAC - if fit == "RANSAC": - try: - # ignore ImportWarnings in sklearn - with warnings.catch_warnings(): - warnings.simplefilter("ignore", ImportWarning) - import sklearn.linear_model as sklin - import sklearn.preprocessing as skpre - except ImportError: - warnings.warn( - "fitting mode 'RANSAC' requires the package sklearn, using" - " 'poly' instead", - RuntimeWarning, stacklevel=2) - fit = "poly" - - if fit == "poly": - return np.polyfit(x, y, degree) - if fit == "RANSAC": - model = sklin.RANSACRegressor(sklin.LinearRegression(fit_intercept=False)) - xdat = np.asarray(x) - if len(xdat.shape) == 1: - # interpret 1d-array as list of len(x) samples instead of - # one sample of length len(x) - xdat = xdat.reshape(-1, 1) - polydat = skpre.PolynomialFeatures(degree).fit_transform(xdat) - try: - model.fit(polydat, y) - coef = model.estimator_.coef_[::-1] - except ValueError: - warnings.warn( - "RANSAC did not reach consensus, " - "using numpy's polyfit", - RuntimeWarning, stacklevel=2) - coef = np.polyfit(x, y, degree) - return coef - msg = f"invalid fitting mode ({fit})" - raise ValueError(msg) + # check if we can use RANSAC + if fit == "RANSAC": + try: + # ignore ImportWarnings in sklearn + with warnings.catch_warnings(): + warnings.simplefilter("ignore", ImportWarning) + import sklearn.linear_model as sklin + import sklearn.preprocessing as 
skpre + except ImportError: + warnings.warn( + "fitting mode 'RANSAC' requires the package sklearn, using 'poly' instead", + RuntimeWarning, + stacklevel=2, + ) + fit = "poly" + + if fit == "poly": + return np.polyfit(x, y, degree) + if fit == "RANSAC": + model = sklin.RANSACRegressor(sklin.LinearRegression(fit_intercept=False)) + xdat = np.asarray(x) + if len(xdat.shape) == 1: + # interpret 1d-array as list of len(x) samples instead of + # one sample of length len(x) + xdat = xdat.reshape(-1, 1) + polydat = skpre.PolynomialFeatures(degree).fit_transform(xdat) + try: + model.fit(polydat, y) + coef = model.estimator_.coef_[::-1] + except ValueError: + warnings.warn( + "RANSAC did not reach consensus, using numpy's polyfit", + RuntimeWarning, + stacklevel=2, + ) + coef = np.polyfit(x, y, degree) + return coef + msg = f"invalid fitting mode ({fit})" + raise ValueError(msg) def delay_embedding(data, emb_dim, lag=1): - """Perform a time-delay embedding of a time series. - - Args: - data (array-like): - the data that should be embedded - emb_dim (int): - the embedding dimension - Kwargs: - lag (int): - the lag between elements in the embedded vectors - - Returns: - emb_dim x m array: - matrix of embedded vectors of the form - [data[i], data[i+lag], data[i+2*lag], ... data[i+(emb_dim-1)*lag]] - for i in 0 to m-1 (m = len(data)-(emb_dim-1)*lag) - """ - data = np.asarray(data) - min_len = (emb_dim - 1) * lag + 1 - if len(data) < min_len: - msg = "cannot embed data of length {} with embedding dimension {} " \ - "and lag {}, minimum required length is {}" - raise ValueError(msg.format(len(data), emb_dim, lag, min_len)) - m = len(data) - min_len + 1 - indices = np.repeat([np.arange(emb_dim) * lag], m, axis=0) - indices += np.arange(m).reshape((m, 1)) - return data[indices] + """Perform a time-delay embedding of a time series. 
+ + Args: + data (array-like): + the data that should be embedded + emb_dim (int): + the embedding dimension + Kwargs: + lag (int): + the lag between elements in the embedded vectors + + Returns: + emb_dim x m array: + matrix of embedded vectors of the form + [data[i], data[i+lag], data[i+2*lag], ... data[i+(emb_dim-1)*lag]] + for i in 0 to m-1 (m = len(data)-(emb_dim-1)*lag) + """ + data = np.asarray(data) + min_len = (emb_dim - 1) * lag + 1 + if len(data) < min_len: + msg = ( + "cannot embed data of length {} with embedding dimension {} " + "and lag {}, minimum required length is {}" + ) + raise ValueError(msg.format(len(data), emb_dim, lag, min_len)) + m = len(data) - min_len + 1 + indices = np.repeat([np.arange(emb_dim) * lag], m, axis=0) + indices += np.arange(m).reshape((m, 1)) + return data[indices] def lyap_r_len(**kwargs): - """Helper function that calculates the minimum number of data points required - to use lyap_r. - - Note that none of the required parameters may be set to None. - - Kwargs: - kwargs(dict): - arguments used for lyap_r (required: emb_dim, lag, trajectory_len and - min_tsep) - - Returns: - minimum number of data points required to call lyap_r with the given - parameters - """ - # minimum length required to find single orbit vector - min_len = (kwargs["emb_dim"] - 1) * kwargs["lag"] + 1 - # we need trajectory_len orbit vectors to follow a complete trajectory - min_len += kwargs["trajectory_len"] - 1 - # we need min_tsep * 2 + 1 orbit vectors to find neighbors for each - min_len += kwargs["min_tsep"] * 2 + 1 - return min_len - - -def lyap_r(data, emb_dim=10, lag=None, min_tsep=None, tau=1, min_neighbors=20, - trajectory_len=20, fit="RANSAC", debug_plot=False, debug_data=False, - plot_file=None, fit_offset=0): - """Estimates the largest Lyapunov exponent using the algorithm of Rosenstein - et al. [lr_1]_. - - Explanation of Lyapunov exponents: - See lyap_e. - - Explanation of the algorithm: - The algorithm of Rosenstein et al. 
is only able to recover the largest - Lyapunov exponent, but behaves rather robust to parameter choices. - - The idea for the algorithm relates closely to the definition of Lyapunov - exponents. First, the dynamics of the data are reconstructed using a delay - embedding method with a lag, such that each value x_i of the data is mapped - to the vector - - X_i = [x_i, x_(i+lag), x_(i+2*lag), ..., x_(i+(emb_dim-1) * lag)] - - For each such vector X_i, we find the closest neighbor X_j using the - euclidean distance. We know that as we follow the trajectories from X_i and - X_j in time in a chaotic system the distances between X_(i+k) and X_(j+k) - denoted as d_i(k) will increase according to a power law - d_i(k) = c * e^(lambda * k) where lambda is a good approximation of the - highest Lyapunov exponent, because the exponential expansion along the axis - associated with this exponent will quickly dominate the expansion or - contraction along other axes. - - To calculate lambda, we look at the logarithm of the distance trajectory, - because log(d_i(k)) = log(c) + lambda * k. This gives a set of lines - (one for each index i) whose slope is an approximation of lambda. We - therefore extract the mean log trajectory d'(k) by taking the mean of - log(d_i(k)) over all orbit vectors X_i. We then fit a straight line to - the plot of d'(k) versus k. The slope of the line gives the desired - parameter lambda. - - Method for choosing min_tsep: - Usually we want to find neighbors between points that are close in phase - space but not too close in time, because we want to avoid spurious - correlations between the obtained trajectories that originate from temporal - dependencies rather than the dynamic properties of the system. Therefore it - is critical to find a good value for min_tsep. One rather plausible - estimate for this value is to set min_tsep to the mean period of the - signal, which can be obtained by calculating the mean frequency using the - fast fourier transform. 
This procedure is used by default if the user sets - min_tsep = None. Note that this default procedure uses a naive approach - for estimating the power spectral density, which just takes the FFT of the - whole signal without applying any windowing function to avoid biases. If - you have a non-stationary input and want more than a rough estimate, - consider calculating min_tsep manually using a sliding window approach - like Welch's method (implemented in `scipy.signal.welch`). - - Method for choosing lag: - Another parameter that can be hard to choose by instinct alone is the lag - between individual values in a vector of the embedded orbit. Here, - Rosenstein et al. suggest to set the lag to the distance where the - autocorrelation function drops below 1 - 1/e times its original (maximal) - value. This procedure is used by default if the user sets lag = None. - - References: - .. [lr_1] M. T. Rosenstein, J. J. Collins, and C. J. De Luca, - “A practical method for calculating largest Lyapunov exponents from - small data sets,” Physica D: Nonlinear Phenomena, vol. 65, no. 1, - pp. 117–134, 1993. - - Reference Code: - .. [lr_a] mirwais, "Largest Lyapunov Exponent with Rosenstein's Algorithm", - url: http://www.mathworks.com/matlabcentral/fileexchange/38424-largest-lyapunov-exponent-with-rosenstein-s-algorithm - .. [lr_b] Shapour Mohammadi, "LYAPROSEN: MATLAB function to calculate - Lyapunov exponent", - url: https://ideas.repec.org/c/boc/bocode/t741502.html - .. 
[lr_c] Rainer Hegger, Holger Kantz, and Thomas Schreiber, "TISEAN 3.0.0 - Nonlinear Time Series Analysis", - url: https://www.pks.mpg.de/tisean/Tisean_3.0.0/docs/docs_c/lyap_r.html - - Args: - data (iterable of float): - (one-dimensional) time series - Kwargs: - emb_dim (int): - embedding dimension for delay embedding - lag (float): - lag for delay embedding - min_tsep (float): - minimal temporal separation between two "neighbors" (default: - find a suitable value by calculating the mean period of the data) - tau (float): - step size between data points in the time series in seconds - (normalization scaling factor for exponents) - min_neighbors (int): - if lag=None, the search for a suitable lag will be stopped when the - number of potential neighbors for a vector drops below min_neighbors - trajectory_len (int): - the time (in number of data points) to follow the distance - trajectories between two neighboring points - fit (str): - the fitting method to use for the line fit, either 'poly' for normal - least squares polynomial fitting or 'RANSAC' for RANSAC-fitting which - is more robust to outliers - debug_plot (boolean): - if True, a simple plot of the final line-fitting step will - be shown - debug_data (boolean): - if True, debugging data will be returned alongside the result - plot_file (str): - if debug_plot is True and plot_file is not None, the plot will be saved - under the given file name instead of directly showing it through - ``plt.show()`` - fit_offset (int): - neglect the first fit_offset steps when fitting - - Returns: - float: - an estimate of the largest Lyapunov exponent (a positive exponent is - a strong indicator for chaos) - (1d-vector, 1d-vector, list): - only present if debug_data is True: debug data of the form - ``(ks, div_traj, poly)`` where ``ks`` are the x-values of the line fit, - ``div_traj`` are the y-values and ``poly`` are the line coefficients - (``[slope, intercept]``). 
- - """ - # convert data to float to avoid overflow errors in rowwise_euclidean - data = np.asarray(data, dtype=np.float64) - n = len(data) - max_tsep_factor = 0.25 - if lag is None or min_tsep is None: - # both the algorithm for lag and min_tsep need the fft - f = np.fft.rfft(data, n * 2 - 1) - if min_tsep is None: - # calculate min_tsep as mean period (= 1 / mean frequency) - # to get the mean frequency, we weight the frequency buckets in the - # fft result by the absolute power in that bucket and then divide - # by the total power across all buckets to get a weighted mean. - # This can be inaccurate for non-stationary inputs. A better approach would - # be to use scipy.signal.welch, but this requires making some other - # parameter choices like the size of the sliding window that require some - # knowledge about the input data, which we don't have at this point. - freqs = np.fft.rfftfreq(n * 2 - 1) - psd = np.abs(f)**2 - mf = np.sum(freqs[1:] * psd[1:]) / np.sum(psd[1:]) - min_tsep = int(np.ceil(1.0 / mf)) - if min_tsep > max_tsep_factor * n: - min_tsep = int(max_tsep_factor * n) - msg = "signal has very low mean frequency, setting min_tsep = {:d}" - warnings.warn(msg.format(min_tsep), RuntimeWarning, stacklevel=2) - if lag is None: - # calculate the lag as point where the autocorrelation drops to (1 - 1/e) - # times its maximum value - # note: the Wiener–Khinchin theorem states that the spectral - # decomposition of the autocorrelation function of a process is the power - # spectrum of that process - # => we can use fft to calculate the autocorrelation - acorr = np.fft.irfft(f * np.conj(f)) - acorr = np.roll(acorr, n - 1) - eps = acorr[n - 1] * (1 - 1.0 / np.e) - lag = 1 - - # small helper function to calculate resulting number of vectors for a - # given lag value - def nb_neighbors(lag_value): - min_len = lyap_r_len( - emb_dim=emb_dim, lag=lag_value, trajectory_len=trajectory_len, + """Helper function that calculates the minimum number of data points required 
+ to use lyap_r. + + Note that none of the required parameters may be set to None. + + Kwargs: + kwargs(dict): + arguments used for lyap_r (required: emb_dim, lag, trajectory_len and + min_tsep) + + Returns: + minimum number of data points required to call lyap_r with the given + parameters + """ + # minimum length required to find single orbit vector + min_len = (kwargs["emb_dim"] - 1) * kwargs["lag"] + 1 + # we need trajectory_len orbit vectors to follow a complete trajectory + min_len += kwargs["trajectory_len"] - 1 + # we need min_tsep * 2 + 1 orbit vectors to find neighbors for each + min_len += kwargs["min_tsep"] * 2 + 1 + return min_len + + +def lyap_r( + data, + emb_dim=10, + lag=None, + min_tsep=None, + tau=1, + min_neighbors=20, + trajectory_len=20, + fit="RANSAC", + debug_plot=False, + debug_data=False, + plot_file=None, + fit_offset=0, +): + """Estimates the largest Lyapunov exponent using the algorithm of Rosenstein + et al. [lr_1]_. + + Explanation of Lyapunov exponents: + See lyap_e. + + Explanation of the algorithm: + The algorithm of Rosenstein et al. is only able to recover the largest + Lyapunov exponent, but behaves rather robust to parameter choices. + + The idea for the algorithm relates closely to the definition of Lyapunov + exponents. First, the dynamics of the data are reconstructed using a delay + embedding method with a lag, such that each value x_i of the data is mapped + to the vector + + X_i = [x_i, x_(i+lag), x_(i+2*lag), ..., x_(i+(emb_dim-1) * lag)] + + For each such vector X_i, we find the closest neighbor X_j using the + euclidean distance. 
We know that as we follow the trajectories from X_i and + X_j in time in a chaotic system the distances between X_(i+k) and X_(j+k) + denoted as d_i(k) will increase according to a power law + d_i(k) = c * e^(lambda * k) where lambda is a good approximation of the + highest Lyapunov exponent, because the exponential expansion along the axis + associated with this exponent will quickly dominate the expansion or + contraction along other axes. + + To calculate lambda, we look at the logarithm of the distance trajectory, + because log(d_i(k)) = log(c) + lambda * k. This gives a set of lines + (one for each index i) whose slope is an approximation of lambda. We + therefore extract the mean log trajectory d'(k) by taking the mean of + log(d_i(k)) over all orbit vectors X_i. We then fit a straight line to + the plot of d'(k) versus k. The slope of the line gives the desired + parameter lambda. + + Method for choosing min_tsep: + Usually we want to find neighbors between points that are close in phase + space but not too close in time, because we want to avoid spurious + correlations between the obtained trajectories that originate from temporal + dependencies rather than the dynamic properties of the system. Therefore it + is critical to find a good value for min_tsep. One rather plausible + estimate for this value is to set min_tsep to the mean period of the + signal, which can be obtained by calculating the mean frequency using the + fast fourier transform. This procedure is used by default if the user sets + min_tsep = None. Note that this default procedure uses a naive approach + for estimating the power spectral density, which just takes the FFT of the + whole signal without applying any windowing function to avoid biases. If + you have a non-stationary input and want more than a rough estimate, + consider calculating min_tsep manually using a sliding window approach + like Welch's method (implemented in `scipy.signal.welch`). 
+ + Method for choosing lag: + Another parameter that can be hard to choose by instinct alone is the lag + between individual values in a vector of the embedded orbit. Here, + Rosenstein et al. suggest to set the lag to the distance where the + autocorrelation function drops below 1 - 1/e times its original (maximal) + value. This procedure is used by default if the user sets lag = None. + + References: + .. [lr_1] M. T. Rosenstein, J. J. Collins, and C. J. De Luca, + “A practical method for calculating largest Lyapunov exponents from + small data sets,” Physica D: Nonlinear Phenomena, vol. 65, no. 1, + pp. 117–134, 1993. + + Reference Code: + .. [lr_a] mirwais, "Largest Lyapunov Exponent with Rosenstein's Algorithm", + url: http://www.mathworks.com/matlabcentral/fileexchange/38424-largest-lyapunov-exponent-with-rosenstein-s-algorithm + .. [lr_b] Shapour Mohammadi, "LYAPROSEN: MATLAB function to calculate + Lyapunov exponent", + url: https://ideas.repec.org/c/boc/bocode/t741502.html + .. 
[lr_c] Rainer Hegger, Holger Kantz, and Thomas Schreiber, "TISEAN 3.0.0 - Nonlinear Time Series Analysis", + url: https://www.pks.mpg.de/tisean/Tisean_3.0.0/docs/docs_c/lyap_r.html + + Args: + data (iterable of float): + (one-dimensional) time series + Kwargs: + emb_dim (int): + embedding dimension for delay embedding + lag (float): + lag for delay embedding + min_tsep (float): + minimal temporal separation between two "neighbors" (default: + find a suitable value by calculating the mean period of the data) + tau (float): + step size between data points in the time series in seconds + (normalization scaling factor for exponents) + min_neighbors (int): + if lag=None, the search for a suitable lag will be stopped when the + number of potential neighbors for a vector drops below min_neighbors + trajectory_len (int): + the time (in number of data points) to follow the distance + trajectories between two neighboring points + fit (str): + the fitting method to use for the line fit, either 'poly' for normal + least squares polynomial fitting or 'RANSAC' for RANSAC-fitting which + is more robust to outliers + debug_plot (boolean): + if True, a simple plot of the final line-fitting step will + be shown + debug_data (boolean): + if True, debugging data will be returned alongside the result + plot_file (str): + if debug_plot is True and plot_file is not None, the plot will be saved + under the given file name instead of directly showing it through + ``plt.show()`` + fit_offset (int): + neglect the first fit_offset steps when fitting + + Returns: + float: + an estimate of the largest Lyapunov exponent (a positive exponent is + a strong indicator for chaos) + (1d-vector, 1d-vector, list): + only present if debug_data is True: debug data of the form + ``(ks, div_traj, poly)`` where ``ks`` are the x-values of the line fit, + ``div_traj`` are the y-values and ``poly`` are the line coefficients + (``[slope, intercept]``). 
+ + """ + # convert data to float to avoid overflow errors in rowwise_euclidean + data = np.asarray(data, dtype=np.float64) + n = len(data) + max_tsep_factor = 0.25 + if lag is None or min_tsep is None: + # both the algorithm for lag and min_tsep need the fft + f = np.fft.rfft(data, n * 2 - 1) + if min_tsep is None: + # calculate min_tsep as mean period (= 1 / mean frequency) + # to get the mean frequency, we weight the frequency buckets in the + # fft result by the absolute power in that bucket and then divide + # by the total power across all buckets to get a weighted mean. + # This can be inaccurate for non-stationary inputs. A better approach would + # be to use scipy.signal.welch, but this requires making some other + # parameter choices like the size of the sliding window that require some + # knowledge about the input data, which we don't have at this point. + freqs = np.fft.rfftfreq(n * 2 - 1) + psd = np.abs(f) ** 2 + mf = np.sum(freqs[1:] * psd[1:]) / np.sum(psd[1:]) + min_tsep = int(np.ceil(1.0 / mf)) + if min_tsep > max_tsep_factor * n: + min_tsep = int(max_tsep_factor * n) + msg = "signal has very low mean frequency, setting min_tsep = {:d}" + warnings.warn(msg.format(min_tsep), RuntimeWarning, stacklevel=2) + if lag is None: + # calculate the lag as point where the autocorrelation drops to (1 - 1/e) + # times its maximum value + # note: the Wiener–Khinchin theorem states that the spectral + # decomposition of the autocorrelation function of a process is the power + # spectrum of that process + # => we can use fft to calculate the autocorrelation + acorr = np.fft.irfft(f * np.conj(f)) + acorr = np.roll(acorr, n - 1) + eps = acorr[n - 1] * (1 - 1.0 / np.e) + lag = 1 + + # small helper function to calculate resulting number of vectors for a + # given lag value + def nb_neighbors(lag_value): + min_len = lyap_r_len( + emb_dim=emb_dim, + lag=lag_value, + trajectory_len=trajectory_len, + min_tsep=min_tsep, + ) + return max(0, n - min_len) + + # find lag + for 
i in range(1, n): + lag = i + if acorr[n - 1 + i] < eps or acorr[n - 1 - i] < eps: + break + if nb_neighbors(i) < min_neighbors: + msg = "autocorrelation declined too slowly to find suitable lag, setting lag to {}" + warnings.warn(msg.format(lag), RuntimeWarning, stacklevel=2) + break + min_len = lyap_r_len( + emb_dim=emb_dim, + lag=lag, + trajectory_len=trajectory_len, min_tsep=min_tsep, - ) - return max(0, n - min_len) - # find lag - for i in range(1, n): - lag = i - if acorr[n - 1 + i] < eps or acorr[n - 1 - i] < eps: - break - if nb_neighbors(i) < min_neighbors: - msg = "autocorrelation declined too slowly to find suitable lag" \ - ", setting lag to {}" - warnings.warn(msg.format(lag), RuntimeWarning, stacklevel=2) - break - min_len = lyap_r_len( - emb_dim=emb_dim, lag=lag, trajectory_len=trajectory_len, - min_tsep=min_tsep, - ) - if len(data) < min_len: - msg = "for emb_dim = {}, lag = {}, min_tsep = {} and trajectory_len = {}" \ - " you need at least {} datapoints in your time series" - warnings.warn( - msg.format(emb_dim, lag, min_tsep, trajectory_len, min_len), - RuntimeWarning, stacklevel=2, ) - # delay embedding - orbit = delay_embedding(data, emb_dim, lag) - m = len(orbit) - # construct matrix with pairwise distances between vectors in orbit - dists = np.array([rowwise_euclidean(orbit, orbit[i]) for i in range(m)]) - # we do not want to consider vectors as neighbor that are less than min_tsep - # time steps together => mask the distances min_tsep to the right and left of - # each index by setting them to infinity (will never be considered as nearest - # neighbors) - for i in range(m): - dists[i, max(0, i - min_tsep):i + min_tsep + 1] = float("inf") - # check that we have enough data points to continue - ntraj = m - trajectory_len + 1 - min_traj = min_tsep * 2 + 2 # in each row min_tsep + 1 disances are inf - if ntraj <= 0: - msg = "Not enough data points. Need {} additional data points to follow " \ - "a complete trajectory." 
- raise ValueError(msg.format(-ntraj+1)) - if ntraj < min_traj: - # not enough data points => there are rows where all values are inf - assert np.any(np.all(np.isinf(dists[:ntraj, :ntraj]), axis=1)) - msg = "Not enough data points. At least {} trajectories are required " \ - "to find a valid neighbor for each orbit vector with min_tsep={} " \ - "but only {} could be created." - raise ValueError(msg.format(min_traj, min_tsep, ntraj)) - assert np.all(np.any(np.isfinite(dists[:ntraj, :ntraj]), axis=1)) - # find nearest neighbors (exclude last columns, because these vectors cannot - # be followed in time for trajectory_len steps) - nb_idx = np.argmin(dists[:ntraj, :ntraj], axis=1) - - # build divergence trajectory by averaging distances along the trajectory - # over all neighbor pairs - div_traj = np.zeros(trajectory_len, dtype=float) - for k in range(trajectory_len): - # calculate mean trajectory distance at step k - indices = (np.arange(ntraj) + k, nb_idx + k) - div_traj_k = dists[indices] - # filter entries where distance is zero (would lead to -inf after log) - nonzero = np.where(div_traj_k != 0) - if len(nonzero[0]) == 0: - # if all entries where zero, we have to use -inf - div_traj[k] = -np.inf + if len(data) < min_len: + msg = ( + "for emb_dim = {}, lag = {}, min_tsep = {} and trajectory_len = {}" + " you need at least {} datapoints in your time series" + ) + warnings.warn( + msg.format(emb_dim, lag, min_tsep, trajectory_len, min_len), + RuntimeWarning, + stacklevel=2, + ) + # delay embedding + orbit = delay_embedding(data, emb_dim, lag) + m = len(orbit) + # construct matrix with pairwise distances between vectors in orbit + dists = np.array([rowwise_euclidean(orbit, orbit[i]) for i in range(m)]) + # we do not want to consider vectors as neighbor that are less than min_tsep + # time steps together => mask the distances min_tsep to the right and left of + # each index by setting them to infinity (will never be considered as nearest + # neighbors) + for i in 
range(m): + dists[i, max(0, i - min_tsep) : i + min_tsep + 1] = float("inf") + # check that we have enough data points to continue + ntraj = m - trajectory_len + 1 + min_traj = min_tsep * 2 + 2 # in each row min_tsep + 1 disances are inf + if ntraj <= 0: + msg = ( + "Not enough data points. Need {} additional data points to follow " + "a complete trajectory." + ) + raise ValueError(msg.format(-ntraj + 1)) + if ntraj < min_traj: + # not enough data points => there are rows where all values are inf + assert np.any(np.all(np.isinf(dists[:ntraj, :ntraj]), axis=1)) + msg = ( + "Not enough data points. At least {} trajectories are required " + "to find a valid neighbor for each orbit vector with min_tsep={} " + "but only {} could be created." + ) + raise ValueError(msg.format(min_traj, min_tsep, ntraj)) + assert np.all(np.any(np.isfinite(dists[:ntraj, :ntraj]), axis=1)) + # find nearest neighbors (exclude last columns, because these vectors cannot + # be followed in time for trajectory_len steps) + nb_idx = np.argmin(dists[:ntraj, :ntraj], axis=1) + + # build divergence trajectory by averaging distances along the trajectory + # over all neighbor pairs + div_traj = np.zeros(trajectory_len, dtype=float) + for k in range(trajectory_len): + # calculate mean trajectory distance at step k + indices = (np.arange(ntraj) + k, nb_idx + k) + div_traj_k = dists[indices] + # filter entries where distance is zero (would lead to -inf after log) + nonzero = np.where(div_traj_k != 0) + if len(nonzero[0]) == 0: + # if all entries where zero, we have to use -inf + div_traj[k] = -np.inf + else: + div_traj[k] = np.mean(np.log(div_traj_k[nonzero])) + # filter -inf entries from mean trajectory + ks = np.arange(trajectory_len) + finite = np.where(np.isfinite(div_traj)) + ks = ks[finite] + div_traj = div_traj[finite] + if len(ks) < 1: + # if all points or all but one point in the trajectory is -inf, we cannot + # fit a line through the remaining points => return -inf as exponent + poly = 
[-np.inf, 0] else: - div_traj[k] = np.mean(np.log(div_traj_k[nonzero])) - # filter -inf entries from mean trajectory - ks = np.arange(trajectory_len) - finite = np.where(np.isfinite(div_traj)) - ks = ks[finite] - div_traj = div_traj[finite] - if len(ks) < 1: - # if all points or all but one point in the trajectory is -inf, we cannot - # fit a line through the remaining points => return -inf as exponent - poly = [-np.inf, 0] - else: - # normal line fitting - poly = poly_fit(ks[fit_offset:], div_traj[fit_offset:], 1, fit=fit) - if debug_plot: - plot_reg( - ks[fit_offset:], div_traj[fit_offset:], - poly, "k", "log(d(k))", fname=plot_file) - le = poly[0] / tau - if debug_data: - return (le, (ks, div_traj, poly)) - return le + # normal line fitting + poly = poly_fit(ks[fit_offset:], div_traj[fit_offset:], 1, fit=fit) + if debug_plot: + plot_reg(ks[fit_offset:], div_traj[fit_offset:], poly, "k", "log(d(k))", fname=plot_file) + le = poly[0] / tau + if debug_data: + return (le, (ks, div_traj, poly)) + return le def lyap_e_len(**kwargs): - """Helper function that calculates the minimum number of data points required - to use lyap_e. - - Note that none of the required parameters may be set to None. 
- - Kwargs: - kwargs(dict): - arguments used for lyap_e (required: emb_dim, matrix_dim, min_nb - and min_tsep) - - Returns: - minimum number of data points required to call lyap_e with the given - parameters - """ - m = (kwargs["emb_dim"] - 1) // (kwargs["matrix_dim"] - 1) - # minimum length required to find single orbit vector - min_len = kwargs["emb_dim"] - # we need to follow each starting point of an orbit vector for m more steps - min_len += m - # we need min_tsep * 2 + 1 orbit vectors to find neighbors for each - min_len += kwargs["min_tsep"] * 2 - # we need at least min_nb neighbors for each orbit vector - min_len += kwargs["min_nb"] - return min_len - - -def lyap_e(data, emb_dim=10, matrix_dim=4, min_nb=None, min_tsep=0, tau=1, - debug_plot=False, debug_data=False, plot_file=None): - r"""Estimates the Lyapunov exponents for the given data using the algorithm of - Eckmann et al. [le_1]_. - - Recommendations for parameter settings by Eckmann et al.: - * long recording time improves accuracy, small tau does not - * use large values for emb_dim - * matrix_dim should be 'somewhat larger than the expected number of - positive Lyapunov exponents' - * min_nb = min(2 * matrix_dim, matrix_dim + 4) - - Explanation of Lyapunov exponents: - The Lyapunov exponent describes the rate of separation of two - infinitesimally close trajectories of a dynamical system in phase space. - In a chaotic system, these trajectories diverge exponentially following - the equation: - - \|X(t, X_0) - X(t, X_0 + eps)| = e^(lambda * t) * \|eps| - - In this equation X(t, X_0) is the trajectory of the system X starting at - the point X_0 in phase space at time t. eps is the (infinitesimal) - difference vector and lambda is called the Lyapunov exponent. If the - system has more than one free variable, the phase space is - multidimensional and each dimension has its own Lyapunov exponent. 
The - existence of at least one positive Lyapunov exponent is generally seen as - a strong indicator for chaos. - - Explanation of the Algorithm: - To calculate the Lyapunov exponents analytically, the Jacobian of the - system is required. The algorithm of Eckmann et al. therefore tries to - estimate this Jacobian by reconstructing the dynamics of the system from - which the time series was obtained. For this, several steps are required: - - * Embed the time series [x_1, x_2, ..., x_(N-1)] in an orbit of emb_dim - dimensions (map each point x_i of the time series to a vector - [x_i, x_(i+1), x_(i+2), ... x_(i+emb_dim-1)]). - * For each vector X_i in this orbit find a radius r_i so that at least - min_nb other vectors lie within (chebyshev-)distance r_i around X_i. - These vectors will be called "neighbors" of X_i. - * Find the Matrix T_i that sends points from the neighborhood of X_i to - the neighborhood of X_(i+1). To avoid undetermined values in T_i, we - construct T_i not with size (emb_dim x emb_dim) but with size - (matrix_dim x matrix_dim), so that we have a larger "step size" m in the - X_i, which are now defined as X'_i = [x_i, x_(i+m), x_(i+2m), - ... x_(i+(matrix_dim-1)*m)]. This means that emb_dim-1 must be divisible - by matrix_dim-1. The T_i are then found by a linear least squares fit, - assuring that T_i (X_j - X_i) ~= X_(j+m) - X_(i+m) for any X_j in the - neighborhood of X_i. - * Starting with i = 1 and Q_0 = identity successively decompose the matrix - T_i * Q_(i-1) into the matrices Q_i and R_i by a QR-decomposition. - * Calculate the Lyapunov exponents from the mean of the logarithm of the - diagonal elements of the matrices R_i. To normalize the Lyapunov - exponents, they have to be divided by m and by the step size tau of the - original time series. - - References: - .. [le_1] J. P. Eckmann, S. O. Kamphorst, D. Ruelle, and S. Ciliberto, - “Liapunov exponents from time series,” Physical Review A, - vol. 34, no. 6, pp. 4971–4979, 1986. 
- - Reference code: - .. [le_a] Manfred Füllsack, "Lyapunov exponent", - url: http://systems-sciences.uni-graz.at/etextbook/sw2/lyapunov.html - .. [le_b] Steve SIU, Lyapunov Exponents Toolbox (LET), - url: http://www.mathworks.com/matlabcentral/fileexchange/233-let/content/LET/findlyap.m - .. [le_c] Rainer Hegger, Holger Kantz, and Thomas Schreiber, TISEAN, - url: http://www.mpipks-dresden.mpg.de/~tisean/Tisean_3.0.1/index.html - - Args: - data (array-like of float): - (scalar) data points - - Kwargs: - emb_dim (int): - embedding dimension - matrix_dim (int): - matrix dimension (emb_dim - 1 must be divisible by matrix_dim - 1) - min_nb (int): - minimal number of neighbors - (default: min(2 * matrix_dim, matrix_dim + 4)) - min_tsep (int): - minimal temporal separation between two "neighbors" - tau (float): - step size of the data in seconds - (normalization scaling factor for exponents) - debug_plot (boolean): - if True, a histogram matrix of the individual estimates will be shown - debug_data (boolean): - if True, debugging data will be returned alongside the result - plot_file (str): - if debug_plot is True and plot_file is not None, the plot will be saved - under the given file name instead of directly showing it through - ``plt.show()`` - - Returns: - float array: - array of matrix_dim Lyapunov exponents (positive exponents are indicators - for chaos) - 2d-array of floats: - only present if debug_data is True: all estimates for the matrix_dim - Lyapunov exponents from the x iterations of R_i. The shape of this debug - data is (x, matrix_dim). - """ - # convert to float to avoid errors when using 'inf' as distance - data = np.asarray(data, dtype=np.float64) - n = len(data) - if (emb_dim - 1) % (matrix_dim - 1) != 0: - msg = "emb_dim - 1 must be divisible by matrix_dim - 1!" - raise ValueError(msg) - m = (emb_dim - 1) // (matrix_dim - 1) - if min_nb is None: - # minimal number of neighbors as suggested by Eckmann et al. 
- min_nb = min(2 * matrix_dim, matrix_dim + 4) - - min_len = lyap_e_len( - emb_dim=emb_dim, matrix_dim=matrix_dim, min_nb=min_nb, min_tsep=min_tsep, - ) - if n < min_len: - msg = "{} data points are not enough! For emb_dim = {}, matrix_dim = {}" \ - ", min_tsep = {} and min_nb = {} you need at least {} data points " \ - "in your time series" - warnings.warn( - msg.format(n, emb_dim, matrix_dim, min_tsep, min_nb, min_len), - RuntimeWarning, stacklevel=2, - ) + """Helper function that calculates the minimum number of data points required + to use lyap_e. - # construct orbit as matrix (e = emb_dim) - # x0 x1 x2 ... xe-1 - # x1 x2 x3 ... xe - # x2 x3 x4 ... xe+1 - # ... - - # note: we need to be able to step m points further for the beta vector - # => maximum start index is n - emb_dim - m - orbit = delay_embedding(data[:-m], emb_dim, lag=1) - if len(orbit) < min_nb: - assert len(data) < min_len - msg = "Not enough data points. Need at least {} additional data points " \ - "to have min_nb = {} neighbor candidates" - raise ValueError(msg.format(min_nb-len(orbit), min_nb)) - old_Q = np.identity(matrix_dim) - lexp = np.zeros(matrix_dim, dtype=np.float64) - lexp_counts = np.zeros(lexp.shape) - debug_values = [] - # TODO reduce number of points to visit? - # TODO performance test! - for i in range(len(orbit)): - # find neighbors for each vector in the orbit using the chebyshev distance - diffs = rowwise_chebyshev(orbit, orbit[i]) - # ensure that we do not count the difference of the vector to itself - diffs[i] = float("inf") - # mask all neighbors that are too close in time to the vector itself - mask_from = max(0, i - min_tsep) - mask_to = min(len(diffs), i + min_tsep + 1) - diffs[mask_from:mask_to] = np.inf - indices = np.argsort(diffs) - idx = indices[min_nb - 1] # index of the min_nb-nearest neighbor - r = diffs[idx] # corresponding distance - if np.isinf(r): - assert len(data) < min_len - msg = "Not enough data points. 
Orbit vector {} has less than min_nb = " \ - "{} valid neighbors that are at least min_tsep = {} time steps " \ - "away. Input must have at least length {}." - raise ValueError(msg.format(i, min_nb, min_tsep, min_len)) - # there may be more than min_nb vectors at distance r (if multiple vectors - # have a distance of exactly r) - # => update index accordingly - indices = np.where(diffs <= r)[0] - - # find the matrix T_i that satisifies - # T_i (orbit'[j] - orbit'[i]) = (orbit'[j+m] - orbit'[i+m]) - # for all neighbors j where orbit'[i] = [x[i], x[i+m], - # ... x[i + (matrix_dim-1)*m]] - - # note that T_i has the following form: - # 0 1 0 ... 0 - # 0 0 1 ... 0 - # ... - # a0 a1 a2 ... a(matrix_dim-1) - - # This is because for all rows except the last one the aforementioned - # equation has a clear solution since orbit'[j+m] - orbit'[i+m] = - # [x[j+m]-x[i+m], x[j+2*m]-x[i+2*m], ... x[j+d_M*m]-x[i+d_M*m]] - # and - # orbit'[j] - orbit'[i] = - # [x[j]-x[i], x[j+m]-x[i+m], ... x[j+(d_M-1)*m]-x[i+(d_M-1)*m]] - # therefore x[j+k*m] - x[i+k*m] is already contained in - # orbit'[j] - orbit'[x] for all k from 1 to matrix_dim-1. Only for - # k = matrix_dim there is an actual problem to solve. - - # We can therefore find a = [a0, a1, a2, ... a(matrix_dim-1)] by - # formulating a linear least squares problem (mat_X * a = vec_beta) - # as follows. - - # build matrix X for linear least squares (d_M = matrix_dim) - # x_j1 - x_i x_j1+m - x_i+m ... x_j1+(d_M-1)m - x_i+(d_M-1)m - # x_j2 - x_i x_j2+m - x_i+m ... x_j2+(d_M-1)m - x_i+(d_M-1)m - # ... + Note that none of the required parameters may be set to None. - # note: emb_dim = (d_M - 1) * m + 1 - mat_X = np.array([data[j:j + emb_dim:m] for j in indices]) - mat_X -= data[i:i + emb_dim:m] + Kwargs: + kwargs(dict): + arguments used for lyap_e (required: emb_dim, matrix_dim, min_nb + and min_tsep) - # build vector beta for linear least squares - # x_j1+(d_M)m - x_i+(d_M)m - # x_j2+(d_M)m - x_i+(d_M)m - # ... 
- if max(np.max(indices), i) + matrix_dim * m >= len(data): - assert len(data) < min_len - msg = "Not enough data points. Cannot follow orbit vector {} for " \ - "{} (matrix_dim * m) time steps. Input must have at least " \ - "length {}." - raise ValueError(msg.format(i, matrix_dim * m, min_len)) - vec_beta = data[indices + matrix_dim * m] - data[i + matrix_dim * m] - - # perform linear least squares - a, _, _, _ = np.linalg.lstsq(mat_X, vec_beta, rcond=-1) - # build matrix T - # 0 1 0 ... 0 - # 0 0 1 ... 0 + Returns: + minimum number of data points required to call lyap_e with the given + parameters + """ + m = (kwargs["emb_dim"] - 1) // (kwargs["matrix_dim"] - 1) + # minimum length required to find single orbit vector + min_len = kwargs["emb_dim"] + # we need to follow each starting point of an orbit vector for m more steps + min_len += m + # we need min_tsep * 2 + 1 orbit vectors to find neighbors for each + min_len += kwargs["min_tsep"] * 2 + # we need at least min_nb neighbors for each orbit vector + min_len += kwargs["min_nb"] + return min_len + + +def lyap_e( + data, + emb_dim=10, + matrix_dim=4, + min_nb=None, + min_tsep=0, + tau=1, + debug_plot=False, + debug_data=False, + plot_file=None, +): + r"""Estimates the Lyapunov exponents for the given data using the algorithm of + Eckmann et al. [le_1]_. + + Recommendations for parameter settings by Eckmann et al.: + * long recording time improves accuracy, small tau does not + * use large values for emb_dim + * matrix_dim should be 'somewhat larger than the expected number of + positive Lyapunov exponents' + * min_nb = min(2 * matrix_dim, matrix_dim + 4) + + Explanation of Lyapunov exponents: + The Lyapunov exponent describes the rate of separation of two + infinitesimally close trajectories of a dynamical system in phase space. 
+ In a chaotic system, these trajectories diverge exponentially following + the equation: + + \|X(t, X_0) - X(t, X_0 + eps)| = e^(lambda * t) * \|eps| + + In this equation X(t, X_0) is the trajectory of the system X starting at + the point X_0 in phase space at time t. eps is the (infinitesimal) + difference vector and lambda is called the Lyapunov exponent. If the + system has more than one free variable, the phase space is + multidimensional and each dimension has its own Lyapunov exponent. The + existence of at least one positive Lyapunov exponent is generally seen as + a strong indicator for chaos. + + Explanation of the Algorithm: + To calculate the Lyapunov exponents analytically, the Jacobian of the + system is required. The algorithm of Eckmann et al. therefore tries to + estimate this Jacobian by reconstructing the dynamics of the system from + which the time series was obtained. For this, several steps are required: + + * Embed the time series [x_1, x_2, ..., x_(N-1)] in an orbit of emb_dim + dimensions (map each point x_i of the time series to a vector + [x_i, x_(i+1), x_(i+2), ... x_(i+emb_dim-1)]). + * For each vector X_i in this orbit find a radius r_i so that at least + min_nb other vectors lie within (chebyshev-)distance r_i around X_i. + These vectors will be called "neighbors" of X_i. + * Find the Matrix T_i that sends points from the neighborhood of X_i to + the neighborhood of X_(i+1). To avoid undetermined values in T_i, we + construct T_i not with size (emb_dim x emb_dim) but with size + (matrix_dim x matrix_dim), so that we have a larger "step size" m in the + X_i, which are now defined as X'_i = [x_i, x_(i+m), x_(i+2m), + ... x_(i+(matrix_dim-1)*m)]. This means that emb_dim-1 must be divisible + by matrix_dim-1. The T_i are then found by a linear least squares fit, + assuring that T_i (X_j - X_i) ~= X_(j+m) - X_(i+m) for any X_j in the + neighborhood of X_i. 
+ * Starting with i = 1 and Q_0 = identity successively decompose the matrix + T_i * Q_(i-1) into the matrices Q_i and R_i by a QR-decomposition. + * Calculate the Lyapunov exponents from the mean of the logarithm of the + diagonal elements of the matrices R_i. To normalize the Lyapunov + exponents, they have to be divided by m and by the step size tau of the + original time series. + + References: + .. [le_1] J. P. Eckmann, S. O. Kamphorst, D. Ruelle, and S. Ciliberto, + “Liapunov exponents from time series,” Physical Review A, + vol. 34, no. 6, pp. 4971–4979, 1986. + + Reference code: + .. [le_a] Manfred Füllsack, "Lyapunov exponent", + url: http://systems-sciences.uni-graz.at/etextbook/sw2/lyapunov.html + .. [le_b] Steve SIU, Lyapunov Exponents Toolbox (LET), + url: http://www.mathworks.com/matlabcentral/fileexchange/233-let/content/LET/findlyap.m + .. [le_c] Rainer Hegger, Holger Kantz, and Thomas Schreiber, TISEAN, + url: http://www.mpipks-dresden.mpg.de/~tisean/Tisean_3.0.1/index.html + + Args: + data (array-like of float): + (scalar) data points + + Kwargs: + emb_dim (int): + embedding dimension + matrix_dim (int): + matrix dimension (emb_dim - 1 must be divisible by matrix_dim - 1) + min_nb (int): + minimal number of neighbors + (default: min(2 * matrix_dim, matrix_dim + 4)) + min_tsep (int): + minimal temporal separation between two "neighbors" + tau (float): + step size of the data in seconds + (normalization scaling factor for exponents) + debug_plot (boolean): + if True, a histogram matrix of the individual estimates will be shown + debug_data (boolean): + if True, debugging data will be returned alongside the result + plot_file (str): + if debug_plot is True and plot_file is not None, the plot will be saved + under the given file name instead of directly showing it through + ``plt.show()`` + + Returns: + float array: + array of matrix_dim Lyapunov exponents (positive exponents are indicators + for chaos) + 2d-array of floats: + only present if 
debug_data is True: all estimates for the matrix_dim + Lyapunov exponents from the x iterations of R_i. The shape of this debug + data is (x, matrix_dim). + """ + # convert to float to avoid errors when using 'inf' as distance + data = np.asarray(data, dtype=np.float64) + n = len(data) + if (emb_dim - 1) % (matrix_dim - 1) != 0: + msg = "emb_dim - 1 must be divisible by matrix_dim - 1!" + raise ValueError(msg) + m = (emb_dim - 1) // (matrix_dim - 1) + if min_nb is None: + # minimal number of neighbors as suggested by Eckmann et al. + min_nb = min(2 * matrix_dim, matrix_dim + 4) + + min_len = lyap_e_len( + emb_dim=emb_dim, + matrix_dim=matrix_dim, + min_nb=min_nb, + min_tsep=min_tsep, + ) + if n < min_len: + msg = ( + "{} data points are not enough! For emb_dim = {}, matrix_dim = {}" + ", min_tsep = {} and min_nb = {} you need at least {} data points " + "in your time series" + ) + warnings.warn( + msg.format(n, emb_dim, matrix_dim, min_tsep, min_nb, min_len), + RuntimeWarning, + stacklevel=2, + ) + + # construct orbit as matrix (e = emb_dim) + # x0 x1 x2 ... xe-1 + # x1 x2 x3 ... xe + # x2 x3 x4 ... xe+1 # ... - # 0 0 0 ... 1 - # a1 a2 a3 ... 
a_(d_M) - mat_T = np.zeros((matrix_dim, matrix_dim)) - mat_T[:-1, 1:] = np.identity(matrix_dim - 1) - mat_T[-1] = a - - # QR-decomposition of T * old_Q - mat_Q, mat_R = np.linalg.qr(np.dot(mat_T, old_Q)) - # force diagonal of R to be positive - # (if QR = A then also QLL'R = A with L' = L^-1) - sign_diag = np.sign(np.diag(mat_R)) - sign_diag[np.where(sign_diag == 0)] = 1 - sign_diag = np.diag(sign_diag) - mat_Q = np.dot(mat_Q, sign_diag) - mat_R = np.dot(sign_diag, mat_R) - - old_Q = mat_Q - # successively build sum for Lyapunov exponents - diag_R = np.diag(mat_R) - # filter zeros in mat_R (would lead to -infs) - idx = np.where(diag_R > 0) - lexp_i = np.zeros(diag_R.shape, dtype=np.float64) - lexp_i[idx] = np.log(diag_R[idx]) - lexp_i[np.where(diag_R == 0)] = np.inf - if debug_plot or debug_data: - debug_values.append(lexp_i / tau / m) - lexp[idx] += lexp_i[idx] - lexp_counts[idx] += 1 - # end of loop over orbit vectors - # it may happen that all R-matrices contained zeros => exponent really has - # to be -inf - if debug_plot: - plot_histogram_matrix(np.array(debug_values), "layp_e", fname=plot_file) - # normalize exponents over number of individual mat_Rs - idx = np.where(lexp_counts > 0) - lexp[idx] /= lexp_counts[idx] - lexp[np.where(lexp_counts == 0)] = np.inf - # normalize with respect to tau - lexp /= tau - # take m into account - lexp /= m - if debug_data: - return (lexp, np.array(debug_values)) - return lexp + + # note: we need to be able to step m points further for the beta vector + # => maximum start index is n - emb_dim - m + orbit = delay_embedding(data[:-m], emb_dim, lag=1) + if len(orbit) < min_nb: + assert len(data) < min_len + msg = ( + "Not enough data points. 
Need at least {} additional data points " + "to have min_nb = {} neighbor candidates" + ) + raise ValueError(msg.format(min_nb - len(orbit), min_nb)) + old_Q = np.identity(matrix_dim) + lexp = np.zeros(matrix_dim, dtype=np.float64) + lexp_counts = np.zeros(lexp.shape) + debug_values = [] + # TODO reduce number of points to visit? + # TODO performance test! + for i in range(len(orbit)): + # find neighbors for each vector in the orbit using the chebyshev distance + diffs = rowwise_chebyshev(orbit, orbit[i]) + # ensure that we do not count the difference of the vector to itself + diffs[i] = float("inf") + # mask all neighbors that are too close in time to the vector itself + mask_from = max(0, i - min_tsep) + mask_to = min(len(diffs), i + min_tsep + 1) + diffs[mask_from:mask_to] = np.inf + indices = np.argsort(diffs) + idx = indices[min_nb - 1] # index of the min_nb-nearest neighbor + r = diffs[idx] # corresponding distance + if np.isinf(r): + assert len(data) < min_len + msg = ( + "Not enough data points. Orbit vector {} has less than min_nb = " + "{} valid neighbors that are at least min_tsep = {} time steps " + "away. Input must have at least length {}." + ) + raise ValueError(msg.format(i, min_nb, min_tsep, min_len)) + # there may be more than min_nb vectors at distance r (if multiple vectors + # have a distance of exactly r) + # => update index accordingly + indices = np.where(diffs <= r)[0] + + # find the matrix T_i that satisifies + # T_i (orbit'[j] - orbit'[i]) = (orbit'[j+m] - orbit'[i+m]) + # for all neighbors j where orbit'[i] = [x[i], x[i+m], + # ... x[i + (matrix_dim-1)*m]] + + # note that T_i has the following form: + # 0 1 0 ... 0 + # 0 0 1 ... 0 + # ... + # a0 a1 a2 ... a(matrix_dim-1) + + # This is because for all rows except the last one the aforementioned + # equation has a clear solution since orbit'[j+m] - orbit'[i+m] = + # [x[j+m]-x[i+m], x[j+2*m]-x[i+2*m], ... 
x[j+d_M*m]-x[i+d_M*m]] + # and + # orbit'[j] - orbit'[i] = + # [x[j]-x[i], x[j+m]-x[i+m], ... x[j+(d_M-1)*m]-x[i+(d_M-1)*m]] + # therefore x[j+k*m] - x[i+k*m] is already contained in + # orbit'[j] - orbit'[x] for all k from 1 to matrix_dim-1. Only for + # k = matrix_dim there is an actual problem to solve. + + # We can therefore find a = [a0, a1, a2, ... a(matrix_dim-1)] by + # formulating a linear least squares problem (mat_X * a = vec_beta) + # as follows. + + # build matrix X for linear least squares (d_M = matrix_dim) + # x_j1 - x_i x_j1+m - x_i+m ... x_j1+(d_M-1)m - x_i+(d_M-1)m + # x_j2 - x_i x_j2+m - x_i+m ... x_j2+(d_M-1)m - x_i+(d_M-1)m + # ... + + # note: emb_dim = (d_M - 1) * m + 1 + mat_X = np.array([data[j : j + emb_dim : m] for j in indices]) + mat_X -= data[i : i + emb_dim : m] + + # build vector beta for linear least squares + # x_j1+(d_M)m - x_i+(d_M)m + # x_j2+(d_M)m - x_i+(d_M)m + # ... + if max(np.max(indices), i) + matrix_dim * m >= len(data): + assert len(data) < min_len + msg = ( + "Not enough data points. Cannot follow orbit vector {} for " + "{} (matrix_dim * m) time steps. Input must have at least " + "length {}." + ) + raise ValueError(msg.format(i, matrix_dim * m, min_len)) + vec_beta = data[indices + matrix_dim * m] - data[i + matrix_dim * m] + + # perform linear least squares + a, _, _, _ = np.linalg.lstsq(mat_X, vec_beta, rcond=-1) + # build matrix T + # 0 1 0 ... 0 + # 0 0 1 ... 0 + # ... + # 0 0 0 ... 1 + # a1 a2 a3 ... 
a_(d_M) + mat_T = np.zeros((matrix_dim, matrix_dim)) + mat_T[:-1, 1:] = np.identity(matrix_dim - 1) + mat_T[-1] = a + + # QR-decomposition of T * old_Q + mat_Q, mat_R = np.linalg.qr(np.dot(mat_T, old_Q)) + # force diagonal of R to be positive + # (if QR = A then also QLL'R = A with L' = L^-1) + sign_diag = np.sign(np.diag(mat_R)) + sign_diag[np.where(sign_diag == 0)] = 1 + sign_diag = np.diag(sign_diag) + mat_Q = np.dot(mat_Q, sign_diag) + mat_R = np.dot(sign_diag, mat_R) + + old_Q = mat_Q + # successively build sum for Lyapunov exponents + diag_R = np.diag(mat_R) + # filter zeros in mat_R (would lead to -infs) + idx = np.where(diag_R > 0) + lexp_i = np.zeros(diag_R.shape, dtype=np.float64) + lexp_i[idx] = np.log(diag_R[idx]) + lexp_i[np.where(diag_R == 0)] = np.inf + if debug_plot or debug_data: + debug_values.append(lexp_i / tau / m) + lexp[idx] += lexp_i[idx] + lexp_counts[idx] += 1 + # end of loop over orbit vectors + # it may happen that all R-matrices contained zeros => exponent really has + # to be -inf + if debug_plot: + plot_histogram_matrix(np.array(debug_values), "layp_e", fname=plot_file) + # normalize exponents over number of individual mat_Rs + idx = np.where(lexp_counts > 0) + lexp[idx] /= lexp_counts[idx] + lexp[np.where(lexp_counts == 0)] = np.inf + # normalize with respect to tau + lexp /= tau + # take m into account + lexp /= m + if debug_data: + return (lexp, np.array(debug_values)) + return lexp def plot_dists(dists, tolerance, m, title=None, fname=None) -> None: - # local import to avoid dependency for non-debug use - import matplotlib.pyplot as plt - nstd = 3 - nbins = 50 - dists_full = np.concatenate(dists) - ymax = len(dists_full) * 0.05 - mean = np.mean(dists_full) - std = np.std(dists_full, ddof=1) - rng = (0, mean + std * nstd) - i = 0 - colors = ["green", "blue"] - for h, bins in [np.histogram(dat, nbins, rng) for dat in dists]: - bw = bins[1] - bins[0] - plt.bar(bins[:-1], h, bw, label=f"m={m + i:d}", - color=colors[i], alpha=0.5) - i 
+= 1 - plt.axvline(tolerance, color="red") - plt.legend(loc="best") - plt.xlabel("distance") - plt.ylabel("count") - plt.ylim(0, ymax) - if title is not None: - plt.title(title) - if fname is None: - plt.show() - else: - plt.savefig(fname) - plt.close() - - -def sampen(data, emb_dim=2, tolerance=None, lag=1, dist=rowwise_chebyshev, - closed=False, debug_plot=False, debug_data=False, plot_file=None): - """Computes the sample entropy of the given data. - - Explanation of the sample entropy: - The sample entropy of a time series is defined as the negative natural - logarithm of the conditional probability that two sequences similar for - emb_dim points remain similar at the next point, excluding self-matches. - - A lower value for the sample entropy therefore corresponds to a higher - probability indicating more self-similarity. - - Explanation of the algorithm: - The algorithm constructs all subsequences of length emb_dim - [s_1, s_1+lag, s_1+2*lag, ...] and then counts each pair (s_i, s_j) with i != j - where dist(s_i, s_j) < tolerance. The same process is repeated for all - subsequences of length emb_dim + 1. The sum of similar sequence pairs - with length emb_dim + 1 is divided by the sum of similar sequence pairs - with length emb_dim. The result of the algorithm is the negative logarithm - of this ratio/probability. - - References: - .. [se_1] J. S. Richman and J. R. Moorman, “Physiological time-series - analysis using approximate entropy and sample entropy,” - American Journal of Physiology-Heart and Circulatory Physiology, - vol. 278, no. 6, pp. H2039–H2049, 2000. - - Reference code: - .. 
[se_a] "sample_entropy" function in R-package "pracma", - url: https://cran.r-project.org/web/packages/pracma/pracma.pdf - - Args: - data (array-like of float): - input data - - Kwargs: - emb_dim (int): - the embedding dimension (length of vectors to compare) - tolerance (float): - distance threshold for two template vectors to be considered equal - (default: 0.2 * std(data) at emb_dim = 2, corrected for dimension effect - for other values of emb_dim) - lag (int): - delay for the delay embedding - dist (function (2d-array, 1d-array) -> 1d-array): - distance function used to calculate the distance between template - vectors. Sampen is defined using ``rowwise_chebyshev``. You should only - use something else, if you are sure that you need it. - closed (boolean): - if True, will check for vector pairs whose distance is in the closed - interval [0, r] (less or equal to r), otherwise the open interval - [0, r) (less than r) will be used - debug_plot (boolean): - if True, a histogram of the individual distances for m and m+1 - debug_data (boolean): - if True, debugging data will be returned alongside the result - plot_file (str): - if debug_plot is True and plot_file is not None, the plot will be saved - under the given file name instead of directly showing it through - ``plt.show()`` - - Returns: - float: - the sample entropy of the data (negative logarithm of ratio between - similar template vectors of length emb_dim + 1 and emb_dim) - [c_m, c_m1]: - list of two floats: count of similar template vectors of length emb_dim - (c_m) and of length emb_dim + 1 (c_m1) - [float list, float list]: - Lists of lists of the form ``[dists_m, dists_m1]`` containing the - distances between template vectors for m (dists_m) - and for m + 1 (dists_m1). - """ - data = np.asarray(data) - - if tolerance is None: - # the reasoning behind this default value is the following: - # 1. physionet uses the default values emb_dim = 2, tolerance = 0.2 - # 2. 
the chebyshev distance rises logarithmically with increasing dimension - # 3. 0.5627 * np.log(emb_dim) + 1.3334 is the logarithmic trend line for - # the chebyshev distance of vectors sampled from a univariate normal - # distribution - # 4. 0.1164 is used as a factor to ensure that tolerance == std * 0.2 for - # emb_dim == 2 - tolerance = np.std(data, ddof=1) * 0.1164 * (0.5627 * np.log(emb_dim) + 1.3334) - n = len(data) - - # build matrix of "template vectors" - # (all consecutive subsequences of length m) - # x0 x1 x2 x3 ... xm-1 - # x1 x2 x3 x4 ... xm - # x2 x3 x4 x5 ... xm+1 - # ... - # x_n-m-1 ... xn-1 - - # since we need two of these matrices for m = emb_dim and m = emb_dim +1, - # we build one that is large enough => shape (emb_dim+1, n-emb_dim) - - # note that we ignore the last possible template vector with length emb_dim, - # because this vector has no corresponding vector of length m+1 and thus does - # not count towards the conditional probability - # (otherwise first dimension would be n-emb_dim+1 and not n-emb_dim) - tVecs = delay_embedding(np.asarray(data), emb_dim+1, lag=lag) - plot_data = [] - counts = [] - for m in [emb_dim, emb_dim + 1]: - counts.append(0) - plot_data.append([]) - # get the matrix that we need for the current m - tVecsM = tVecs[:n - m + 1, :m] - # successively calculate distances between each pair of template vectors - for i in range(len(tVecsM) - 1): - dsts = dist(tVecsM[i + 1:], tVecsM[i]) - if debug_plot or debug_data: - plot_data[-1].extend(dsts) - # count how many distances are smaller than the tolerance - if closed: - counts[-1] += np.sum(dsts <= tolerance) - else: - counts[-1] += np.sum(dsts < tolerance) - if counts[0] > 0 and counts[1] > 0: - saen = -np.log(1.0 * counts[1] / counts[0]) - else: - # log would be infinite or undefined => cannot determine saen - zcounts = [] - if counts[0] == 0: - zcounts.append("emb_dim") - if counts[1] == 0: - zcounts.append("emb_dim + 1") - warnings.warn( - ( - "Zero vectors are within 
tolerance for {}. " \ - "Consider raising the tolerance parameter to avoid {} result." - ).format(" and ".join(zcounts), "NaN" if len(zcounts) == 2 else "inf"), - RuntimeWarning, stacklevel=2, - ) - if counts[0] == 0 and counts[1] == 0: - saen = np.nan - elif counts[0] == 0: - saen = -np.inf + # local import to avoid dependency for non-debug use + import matplotlib.pyplot as plt + + nstd = 3 + nbins = 50 + dists_full = np.concatenate(dists) + ymax = len(dists_full) * 0.05 + mean = np.mean(dists_full) + std = np.std(dists_full, ddof=1) + rng = (0, mean + std * nstd) + i = 0 + colors = ["green", "blue"] + for h, bins in [np.histogram(dat, nbins, rng) for dat in dists]: + bw = bins[1] - bins[0] + plt.bar(bins[:-1], h, bw, label=f"m={m + i:d}", color=colors[i], alpha=0.5) + i += 1 + plt.axvline(tolerance, color="red") + plt.legend(loc="best") + plt.xlabel("distance") + plt.ylabel("count") + plt.ylim(0, ymax) + if title is not None: + plt.title(title) + if fname is None: + plt.show() else: - saen = np.inf - if debug_plot: - plot_dists(plot_data, tolerance, m, title=f"sampEn = {saen:.3f}", - fname=plot_file) - if debug_data: - return (saen, counts, plot_data) - return saen + plt.savefig(fname) + plt.close() + + +def sampen( + data, + emb_dim=2, + tolerance=None, + lag=1, + dist=rowwise_chebyshev, + closed=False, + debug_plot=False, + debug_data=False, + plot_file=None, +): + """Computes the sample entropy of the given data. + + Explanation of the sample entropy: + The sample entropy of a time series is defined as the negative natural + logarithm of the conditional probability that two sequences similar for + emb_dim points remain similar at the next point, excluding self-matches. + + A lower value for the sample entropy therefore corresponds to a higher + probability indicating more self-similarity. + + Explanation of the algorithm: + The algorithm constructs all subsequences of length emb_dim + [s_1, s_1+lag, s_1+2*lag, ...] 
and then counts each pair (s_i, s_j) with i != j + where dist(s_i, s_j) < tolerance. The same process is repeated for all + subsequences of length emb_dim + 1. The sum of similar sequence pairs + with length emb_dim + 1 is divided by the sum of similar sequence pairs + with length emb_dim. The result of the algorithm is the negative logarithm + of this ratio/probability. + + References: + .. [se_1] J. S. Richman and J. R. Moorman, “Physiological time-series + analysis using approximate entropy and sample entropy,” + American Journal of Physiology-Heart and Circulatory Physiology, + vol. 278, no. 6, pp. H2039–H2049, 2000. + + Reference code: + .. [se_a] "sample_entropy" function in R-package "pracma", + url: https://cran.r-project.org/web/packages/pracma/pracma.pdf + + Args: + data (array-like of float): + input data + + Kwargs: + emb_dim (int): + the embedding dimension (length of vectors to compare) + tolerance (float): + distance threshold for two template vectors to be considered equal + (default: 0.2 * std(data) at emb_dim = 2, corrected for dimension effect + for other values of emb_dim) + lag (int): + delay for the delay embedding + dist (function (2d-array, 1d-array) -> 1d-array): + distance function used to calculate the distance between template + vectors. Sampen is defined using ``rowwise_chebyshev``. You should only + use something else, if you are sure that you need it. 
+ closed (boolean): + if True, will check for vector pairs whose distance is in the closed + interval [0, r] (less or equal to r), otherwise the open interval + [0, r) (less than r) will be used + debug_plot (boolean): + if True, a histogram of the individual distances for m and m+1 + debug_data (boolean): + if True, debugging data will be returned alongside the result + plot_file (str): + if debug_plot is True and plot_file is not None, the plot will be saved + under the given file name instead of directly showing it through + ``plt.show()`` + + Returns: + float: + the sample entropy of the data (negative logarithm of ratio between + similar template vectors of length emb_dim + 1 and emb_dim) + [c_m, c_m1]: + list of two floats: count of similar template vectors of length emb_dim + (c_m) and of length emb_dim + 1 (c_m1) + [float list, float list]: + Lists of lists of the form ``[dists_m, dists_m1]`` containing the + distances between template vectors for m (dists_m) + and for m + 1 (dists_m1). + """ + data = np.asarray(data) + + if tolerance is None: + # the reasoning behind this default value is the following: + # 1. physionet uses the default values emb_dim = 2, tolerance = 0.2 + # 2. the chebyshev distance rises logarithmically with increasing dimension + # 3. 0.5627 * np.log(emb_dim) + 1.3334 is the logarithmic trend line for + # the chebyshev distance of vectors sampled from a univariate normal + # distribution + # 4. 0.1164 is used as a factor to ensure that tolerance == std * 0.2 for + # emb_dim == 2 + tolerance = np.std(data, ddof=1) * 0.1164 * (0.5627 * np.log(emb_dim) + 1.3334) + n = len(data) + + # build matrix of "template vectors" + # (all consecutive subsequences of length m) + # x0 x1 x2 x3 ... xm-1 + # x1 x2 x3 x4 ... xm + # x2 x3 x4 x5 ... xm+1 + # ... + # x_n-m-1 ... 
xn-1 + + # since we need two of these matrices for m = emb_dim and m = emb_dim +1, + # we build one that is large enough => shape (emb_dim+1, n-emb_dim) + + # note that we ignore the last possible template vector with length emb_dim, + # because this vector has no corresponding vector of length m+1 and thus does + # not count towards the conditional probability + # (otherwise first dimension would be n-emb_dim+1 and not n-emb_dim) + tVecs = delay_embedding(np.asarray(data), emb_dim + 1, lag=lag) + plot_data = [] + counts = [] + for m in [emb_dim, emb_dim + 1]: + counts.append(0) + plot_data.append([]) + # get the matrix that we need for the current m + tVecsM = tVecs[: n - m + 1, :m] + # successively calculate distances between each pair of template vectors + for i in range(len(tVecsM) - 1): + dsts = dist(tVecsM[i + 1 :], tVecsM[i]) + if debug_plot or debug_data: + plot_data[-1].extend(dsts) + # count how many distances are smaller than the tolerance + if closed: + counts[-1] += np.sum(dsts <= tolerance) + else: + counts[-1] += np.sum(dsts < tolerance) + if counts[0] > 0 and counts[1] > 0: + saen = -np.log(1.0 * counts[1] / counts[0]) + else: + # log would be infinite or undefined => cannot determine saen + zcounts = [] + if counts[0] == 0: + zcounts.append("emb_dim") + if counts[1] == 0: + zcounts.append("emb_dim + 1") + warnings.warn( + ( + "Zero vectors are within tolerance for {}. " + "Consider raising the tolerance parameter to avoid {} result." 
+ ).format(" and ".join(zcounts), "NaN" if len(zcounts) == 2 else "inf"), + RuntimeWarning, + stacklevel=2, + ) + if counts[0] == 0 and counts[1] == 0: + saen = np.nan + elif counts[0] == 0: + saen = -np.inf + else: + saen = np.inf + if debug_plot: + plot_dists(plot_data, tolerance, m, title=f"sampEn = {saen:.3f}", fname=plot_file) + if debug_data: + return (saen, counts, plot_data) + return saen def binary_n(total_N, min_n=50): - """Creates a list of values by successively halving the total length total_N - until the resulting value is less than min_n. + """Creates a list of values by successively halving the total length total_N + until the resulting value is less than min_n. - Non-integer results are rounded down. + Non-integer results are rounded down. - Args: - total_N (int): - total length - Kwargs: - min_n (int): - minimal length after division + Args: + total_N (int): + total length + Kwargs: + min_n (int): + minimal length after division - Returns: - list of integers: - total_N/2, total_N/4, total_N/8, ... until total_N/2^i < min_n - """ - max_exp = np.log2(1.0 * total_N / min_n) - max_exp = int(np.floor(max_exp)) - return [int(np.floor(1.0 * total_N / (2**i))) for i in range(1, max_exp + 1)] + Returns: + list of integers: + total_N/2, total_N/4, total_N/8, ... until total_N/2^i < min_n + """ + max_exp = np.log2(1.0 * total_N / min_n) + max_exp = int(np.floor(max_exp)) + return [int(np.floor(1.0 * total_N / (2**i))) for i in range(1, max_exp + 1)] def logarithmic_n(min_n, max_n, factor): - """Creates a list of values by successively multiplying a minimum value min_n by - a factor > 1 until a maximum value max_n is reached. - - Non-integer results are rounded down. - - Args: - min_n (float): - minimum value (must be < max_n) - max_n (float): - maximum value (must be > min_n) - factor (float): - factor used to increase min_n (must be > 1) - - Returns: - list of integers: - min_n, min_n * factor, min_n * factor^2, ... 
min_n * factor^i < max_n - without duplicates - """ - assert max_n > min_n - assert factor > 1 - # stop condition: min * f^x = max - # => f^x = max/min - # => x = log(max/min) / log(f) - max_i = int(np.floor(np.log(1.0 * max_n / min_n) / np.log(factor))) - ns = [min_n] - for i in range(max_i + 1): - n = int(np.floor(min_n * (factor ** i))) - if n > ns[-1]: - ns.append(n) - return ns - - -def logmid_n(max_n, ratio=1/4.0, nsteps=15): - """Creates an array of integers that lie evenly spaced in the "middle" of the - logarithmic scale from 0 to log(max_n). - - If max_n is very small and/or nsteps is very large, this may lead to - duplicate values which will be removed from the output. - - This function has benefits in hurst_rs, because it cuts away both very small - and very large n, which both can cause problems, and still produces a - logarithmically spaced sequence. - - Args: - max_n (int): - largest possible output value (should be the sequence length when used in - hurst_rs) - - Kwargs: - ratio (float): - width of the "middle" of the logarithmic interval relative to log(max_n). - For example, for ratio=1/2.0 the logarithm of the resulting values will - lie between 0.25 * log(max_n) and 0.75 * log(max_n). - nsteps (float): - (maximum) number of values to take from the specified range - - Returns: - array of int: - a logarithmically spaced sequence of at most nsteps values (may be less, - because only unique values are returned) - """ - l = np.log(max_n) - span = l * ratio - start = l * (1 - ratio) * 0.5 - midrange = start + 1.0*np.arange(nsteps)/nsteps*span - nvals = np.round(np.exp(midrange)).astype("int32") - return np.unique(nvals) + """Creates a list of values by successively multiplying a minimum value min_n by + a factor > 1 until a maximum value max_n is reached. + + Non-integer results are rounded down. 
+ + Args: + min_n (float): + minimum value (must be < max_n) + max_n (float): + maximum value (must be > min_n) + factor (float): + factor used to increase min_n (must be > 1) + + Returns: + list of integers: + min_n, min_n * factor, min_n * factor^2, ... min_n * factor^i < max_n + without duplicates + """ + assert max_n > min_n + assert factor > 1 + # stop condition: min * f^x = max + # => f^x = max/min + # => x = log(max/min) / log(f) + max_i = int(np.floor(np.log(1.0 * max_n / min_n) / np.log(factor))) + ns = [min_n] + for i in range(max_i + 1): + n = int(np.floor(min_n * (factor**i))) + if n > ns[-1]: + ns.append(n) + return ns + + +def logmid_n(max_n, ratio=1 / 4.0, nsteps=15): + """Creates an array of integers that lie evenly spaced in the "middle" of the + logarithmic scale from 0 to log(max_n). + + If max_n is very small and/or nsteps is very large, this may lead to + duplicate values which will be removed from the output. + + This function has benefits in hurst_rs, because it cuts away both very small + and very large n, which both can cause problems, and still produces a + logarithmically spaced sequence. + + Args: + max_n (int): + largest possible output value (should be the sequence length when used in + hurst_rs) + + Kwargs: + ratio (float): + width of the "middle" of the logarithmic interval relative to log(max_n). + For example, for ratio=1/2.0 the logarithm of the resulting values will + lie between 0.25 * log(max_n) and 0.75 * log(max_n). 
+ nsteps (float): + (maximum) number of values to take from the specified range + + Returns: + array of int: + a logarithmically spaced sequence of at most nsteps values (may be less, + because only unique values are returned) + """ + l = np.log(max_n) + span = l * ratio + start = l * (1 - ratio) * 0.5 + midrange = start + 1.0 * np.arange(nsteps) / nsteps * span + nvals = np.round(np.exp(midrange)).astype("int32") + return np.unique(nvals) def logarithmic_r(min_n, max_n, factor): - """Creates a list of values by successively multiplying a minimum value min_n by - a factor > 1 until a maximum value max_n is reached. - - Args: - min_n (float): - minimum value (must be < max_n) - max_n (float): - maximum value (must be > min_n) - factor (float): - factor used to increase min_n (must be > 1) - - Returns: - list of floats: - min_n, min_n * factor, min_n * factor^2, ... min_n * factor^i < max_n - """ - assert max_n > min_n - assert factor > 1 - max_i = int(np.floor(np.log(1.0 * max_n / min_n) / np.log(factor))) - return [min_n * (factor ** i) for i in range(max_i + 1)] + """Creates a list of values by successively multiplying a minimum value min_n by + a factor > 1 until a maximum value max_n is reached. + + Args: + min_n (float): + minimum value (must be < max_n) + max_n (float): + maximum value (must be > min_n) + factor (float): + factor used to increase min_n (must be > 1) + + Returns: + list of floats: + min_n, min_n * factor, min_n * factor^2, ... min_n * factor^i < max_n + """ + assert max_n > min_n + assert factor > 1 + max_i = int(np.floor(np.log(1.0 * max_n / min_n) / np.log(factor))) + return [min_n * (factor**i) for i in range(max_i + 1)] def expected_rs(n): - """Calculates the expected (R/S)_n for white noise for a given n. + """Calculates the expected (R/S)_n for white noise for a given n. - This is used as a correction factor in the function hurst_rs. It uses the - formula of Anis-Lloyd-Peters (see [h_3]_). 
+ This is used as a correction factor in the function hurst_rs. It uses the + formula of Anis-Lloyd-Peters (see [h_3]_). - Args: - n (int): - the value of n for which the expected (R/S)_n should be calculated + Args: + n (int): + the value of n for which the expected (R/S)_n should be calculated - Returns: - float: - expected (R/S)_n for white noise - """ - front = (n - 0.5) / n - i = np.arange(1, n) - back = np.sum(np.sqrt((n - i) / i)) - if n <= 340: - middle = math.gamma((n-1) * 0.5) / math.sqrt(math.pi) / math.gamma(n * 0.5) - else: - middle = 1.0 / math.sqrt(n * math.pi * 0.5) - return front * middle * back + Returns: + float: + expected (R/S)_n for white noise + """ + front = (n - 0.5) / n + i = np.arange(1, n) + back = np.sum(np.sqrt((n - i) / i)) + if n <= 340: + middle = math.gamma((n - 1) * 0.5) / math.sqrt(math.pi) / math.gamma(n * 0.5) + else: + middle = 1.0 / math.sqrt(n * math.pi * 0.5) + return front * middle * back def expected_h(nvals, fit="RANSAC"): - """Uses expected_rs to calculate the expected value for the Hurst exponent h - based on the values of n used for the calculation. - - Args: - nvals (iterable of int): - the values of n used to calculate the individual (R/S)_n - - KWargs: - fit (str): - the fitting method to use for the line fit, either 'poly' for normal - least squares polynomial fitting or 'RANSAC' for RANSAC-fitting which - is more robust to outliers - - Returns: - float: - expected h for white noise - """ - rsvals = [expected_rs(n) for n in nvals] - poly = poly_fit(np.log(nvals), np.log(rsvals), 1, fit=fit) - return poly[0] + """Uses expected_rs to calculate the expected value for the Hurst exponent h + based on the values of n used for the calculation. 
+ + Args: + nvals (iterable of int): + the values of n used to calculate the individual (R/S)_n + + KWargs: + fit (str): + the fitting method to use for the line fit, either 'poly' for normal + least squares polynomial fitting or 'RANSAC' for RANSAC-fitting which + is more robust to outliers + + Returns: + float: + expected h for white noise + """ + rsvals = [expected_rs(n) for n in nvals] + poly = poly_fit(np.log(nvals), np.log(rsvals), 1, fit=fit) + return poly[0] def rs(data, n, unbiased=True): - """Calculates an individual R/S value in the rescaled range approach for - a given n. - - Note: This is just a helper function for hurst_rs and should not be called - directly. - - Args: - data (array-like of float): - time series - n (float): - size of the subseries in which data should be split - - Kwargs: - unbiased (boolean): - if True, the standard deviation based on the unbiased variance - (1/(N-1) instead of 1/N) will be used. This should be the default choice, - since the true mean of the sequences is not known. This parameter should - only be changed to recreate results of other implementations. 
- - Returns: - float: - (R/S)_n - """ - data = np.asarray(data) - total_N = len(data) - m = total_N // n # number of sequences - # cut values at the end of data to make the array divisible by n - data = data[:total_N - (total_N % n)] - # split remaining data into subsequences of length n - seqs = np.reshape(data, (m, n)) - # calculate means of subsequences - means = np.mean(seqs, axis=1) - # normalize subsequences by substracting mean - y = seqs - means.reshape((m, 1)) - # build cumulative sum of subsequences - y = np.cumsum(y, axis=1) - # find ranges - r = np.max(y, axis=1) - np.min(y, axis=1) - # find standard deviation - # we should use the unbiased estimator, since we do not know the true mean - s = np.std(seqs, axis=1, ddof=1 if unbiased else 0) - # some ranges may be zero and have to be excluded from the analysis - idx = np.where(r != 0) - r = r[idx] - s = s[idx] - # it may happen that all ranges are zero (if all values in data are equal) - if len(r) == 0: - return np.nan - # return mean of r/s along subsequence index - return np.mean(r / s) + """Calculates an individual R/S value in the rescaled range approach for + a given n. + + Note: This is just a helper function for hurst_rs and should not be called + directly. + + Args: + data (array-like of float): + time series + n (float): + size of the subseries in which data should be split + + Kwargs: + unbiased (boolean): + if True, the standard deviation based on the unbiased variance + (1/(N-1) instead of 1/N) will be used. This should be the default choice, + since the true mean of the sequences is not known. This parameter should + only be changed to recreate results of other implementations. 
+ + Returns: + float: + (R/S)_n + """ + data = np.asarray(data) + total_N = len(data) + m = total_N // n # number of sequences + # cut values at the end of data to make the array divisible by n + data = data[: total_N - (total_N % n)] + # split remaining data into subsequences of length n + seqs = np.reshape(data, (m, n)) + # calculate means of subsequences + means = np.mean(seqs, axis=1) + # normalize subsequences by substracting mean + y = seqs - means.reshape((m, 1)) + # build cumulative sum of subsequences + y = np.cumsum(y, axis=1) + # find ranges + r = np.max(y, axis=1) - np.min(y, axis=1) + # find standard deviation + # we should use the unbiased estimator, since we do not know the true mean + s = np.std(seqs, axis=1, ddof=1 if unbiased else 0) + # some ranges may be zero and have to be excluded from the analysis + idx = np.where(r != 0) + r = r[idx] + s = s[idx] + # it may happen that all ranges are zero (if all values in data are equal) + if len(r) == 0: + return np.nan + # return mean of r/s along subsequence index + return np.mean(r / s) def plot_histogram_matrix(data, name, bin_range="3sigma", fname=None) -> None: - # local import to avoid dependency for non-debug use - import matplotlib.pyplot as plt - nhists = len(data[0]) - nbins = 25 - ylim = (0, 0.5) - nrows = int(np.ceil(np.sqrt(nhists))) - plt.figure(figsize=(nrows * 4, nrows * 4)) - for i in range(nhists): - plt.subplot(nrows, nrows, i + 1) - absmax = max(abs(np.max(data[:, i])), abs(np.min(data[:, i]))) - if bin_range == "absmax": - rng = (-absmax, absmax) - elif bin_range.endswith("sigma"): - n = int(bin_range[:-len("sigma")]) - mu = np.mean(data[:,i]) - sigma = np.std(data[:, i], ddof=1) - rng = (mu - n * sigma, mu + n * sigma) - h, bins = np.histogram(data[:, i], nbins, rng) - bin_width = bins[1] - bins[0] - h = h.astype(np.float64) / np.sum(h) - plt.bar(bins[:-1], h, bin_width) - plt.axvline(np.mean(data[:, i]), color="red") - plt.ylim(ylim) - plt.title(f"{name:s}[{i:d}]") - if fname is 
None: - plt.show() - else: - plt.savefig(fname) - plt.close() - - -def plot_reg(xvals, yvals, poly, x_label="x", y_label="y", data_label="data", - reg_label="regression line", fname=None) -> None: - """Helper function to plot trend lines for line-fitting approaches. This - function will show a plot through ``plt.show()`` and close it after the - window has been closed by the user. - - Args: - xvals (list/array of float): - list of x-values - yvals (list/array of float): - list of y-values - poly (list/array of float): - polynomial parameters as accepted by ``np.polyval`` - Kwargs: - x_label (str): - label of the x-axis - y_label (str): - label of the y-axis - data_label (str): - label of the data - reg_label(str): - label of the regression line - fname (str): - file name (if not None, the plot will be saved to disc instead of - showing it though ``plt.show()``) - """ - # local import to avoid dependency for non-debug use - import matplotlib.pyplot as plt - plt.plot(xvals, yvals, "bo", label=data_label) - if poly is not None: - plt.plot(xvals, np.polyval(poly, xvals), "r-", label=reg_label) - plt.xlabel(x_label) - plt.ylabel(y_label) - plt.legend(loc="best") - if fname is None: - plt.show() - else: - plt.savefig(fname) - plt.close() - - -def plot_reg_tiled(xvals, yvals, polys, x_label="x", y_label="y", - data_labels=None, reg_labels=None, fname=None, - columns=None) -> None: - """TODO.""" - # local import to avoid dependency for non-debug use - import matplotlib.pyplot as plt - max_span = max([np.max(y) - np.min(y) for y in yvals]) - means = [np.mean(y) for y in yvals] - if columns is None: - columns = min(4, int(np.ceil(np.sqrt(len(xvals))))) - if data_labels is None: - data_labels = ["data"] * len(xvals) - if reg_labels is None: - reg_labels = ["regression line"] * len(xvals) - for i in range(len(xvals)): - plt.subplot(int(np.ceil(len(xvals) / columns)), columns, i + 1) - plt.plot(xvals[i], yvals[i], "bo", label=data_labels[i]) - if polys is not None: - 
plt.plot(xvals[i], np.polyval(polys[i], xvals[i]), "r-", label=reg_labels[i]) + # local import to avoid dependency for non-debug use + import matplotlib.pyplot as plt + + nhists = len(data[0]) + nbins = 25 + ylim = (0, 0.5) + nrows = int(np.ceil(np.sqrt(nhists))) + plt.figure(figsize=(nrows * 4, nrows * 4)) + for i in range(nhists): + plt.subplot(nrows, nrows, i + 1) + absmax = max(abs(np.max(data[:, i])), abs(np.min(data[:, i]))) + if bin_range == "absmax": + rng = (-absmax, absmax) + elif bin_range.endswith("sigma"): + n = int(bin_range[: -len("sigma")]) + mu = np.mean(data[:, i]) + sigma = np.std(data[:, i], ddof=1) + rng = (mu - n * sigma, mu + n * sigma) + h, bins = np.histogram(data[:, i], nbins, rng) + bin_width = bins[1] - bins[0] + h = h.astype(np.float64) / np.sum(h) + plt.bar(bins[:-1], h, bin_width) + plt.axvline(np.mean(data[:, i]), color="red") + plt.ylim(ylim) + plt.title(f"{name:s}[{i:d}]") + if fname is None: + plt.show() + else: + plt.savefig(fname) + plt.close() + + +def plot_reg( + xvals, + yvals, + poly, + x_label="x", + y_label="y", + data_label="data", + reg_label="regression line", + fname=None, +) -> None: + """Helper function to plot trend lines for line-fitting approaches. This + function will show a plot through ``plt.show()`` and close it after the + window has been closed by the user. 
+ + Args: + xvals (list/array of float): + list of x-values + yvals (list/array of float): + list of y-values + poly (list/array of float): + polynomial parameters as accepted by ``np.polyval`` + Kwargs: + x_label (str): + label of the x-axis + y_label (str): + label of the y-axis + data_label (str): + label of the data + reg_label(str): + label of the regression line + fname (str): + file name (if not None, the plot will be saved to disc instead of + showing it though ``plt.show()``) + """ + # local import to avoid dependency for non-debug use + import matplotlib.pyplot as plt + + plt.plot(xvals, yvals, "bo", label=data_label) + if poly is not None: + plt.plot(xvals, np.polyval(poly, xvals), "r-", label=reg_label) plt.xlabel(x_label) plt.ylabel(y_label) - plt.ylim(means[i] - max_span / 2, means[i] + max_span / 2) plt.legend(loc="best") - if fname is None: - plt.show() - else: - plt.savefig(fname) - plt.close() - - -def plot_reg_multiple(xvals, yvals, polys, x_label="x", y_label="y", - data_labels=None, reg_labels=None, fname=None) -> None: - """TODO.""" - import matplotlib.pyplot as plt - if data_labels is None: - data_labels = ["data"] * len(xvals) - if reg_labels is None: - reg_labels = ["regression line"] * len(xvals) - for i in range(len(xvals)): - plt.plot(xvals[i], yvals[i], "+", label=data_labels[i]) - if polys is not None: - plt.plot(xvals[i], np.polyval(polys[i], xvals[i]), label=reg_labels[i]) - plt.xlabel(x_label) - plt.ylabel(y_label) - plt.legend(loc="best") - if fname is None: - plt.show() - else: - plt.savefig(fname) - plt.close() - - -def hurst_rs(data, nvals=None, fit="RANSAC", debug_plot=False, - debug_data=False, plot_file=None, corrected=True, unbiased=True): - """Calculates the Hurst exponent by a standard rescaled range (R/S) approach. - - Explanation of Hurst exponent: - The Hurst exponent is a measure for the "long-term memory" of a - time series, meaning the long statistical dependencies in the data that do - not originate from cycles. 
- - It originates from H.E. Hursts observations of the problem of long-term - storage in water reservoirs. If x_i is the discharge of a river in year i - and we observe this discharge for N years, we can calculate the storage - capacity that would be required to keep the discharge steady at its mean - value. - - To do so, we first subtract the mean over all x_i from the individual - x_i to obtain the departures x'_i from the mean for each year i. As the - excess or deficit in discharge always carries over from year i to year i+1, - we need to examine the cumulative sum of x'_i, denoted by y_i. This - cumulative sum represents the filling of our hypothetical storage. If the - sum is above 0, we are storing excess discharge from the river, if it is - below zero we have compensated a deficit in discharge by releasing - water from the storage. The range (maximum - minimum) R of y_i therefore - represents the total capacity required for the storage. - - Hurst showed that this value follows a steady trend for varying N if it - is normalized by the standard deviation sigma over the x_i. Namely he - obtained the following formula: - - R/sigma = (N/2)^K - - In this equation, K is called the Hurst exponent. Its value is 0.5 for - white noise, but becomes greater for time series that exhibit some positive - dependency on previous values. For negative dependencies it becomes less - than 0.5. - - Explanation of the algorithm: - The rescaled range (R/S) approach is directly derived from Hurst's - definition. The time series of length N is split into non-overlapping - subseries of length n. Then, R and S (S = sigma) are calculated for each - subseries and the mean is taken over all subseries yielding (R/S)_n. This - process is repeated for several lengths n. Finally, the exponent K is - obtained by fitting a straight line to the plot of log((R/S)_n) vs log(n). - - There seems to be no consensus how to chose the subseries lenghts n. 
- This function therefore leaves the choice to the user. The module provides - some utility functions for "typical" values: - - * binary_n: N/2, N/4, N/8, ... - * logarithmic_n: min_n, min_n * f, min_n * f^2, ... - - References: - .. [h_1] H. E. Hurst, “The problem of long-term storage in reservoirs,” - International Association of Scientific Hydrology. Bulletin, vol. 1, - no. 3, pp. 13–27, 1956. - .. [h_2] H. E. Hurst, “A suggested statistical model of some time series - which occur in nature,” Nature, vol. 180, p. 494, 1957. - .. [h_3] R. Weron, “Estimating long-range dependence: finite sample - properties and confidence intervals,” Physica A: Statistical Mechanics - and its Applications, vol. 312, no. 1, pp. 285–299, 2002. - - Reference Code: - .. [h_a] "hurst" function in R-package "pracma", - url: https://cran.r-project.org/web/packages/pracma/pracma.pdf - - Note: Pracma yields several estimates of the Hurst exponent, which - are listed below. Unless otherwise stated they use the divisors - of the length of the sequence as n. The length is reduced by at - most 1% to find the value that has the most divisors. - - * The "Simple R/S" estimate is just log((R/S)_n) / log(n) for - n = N. - * The "theoretical Hurst exponent" is the value that would be - expected of an uncorrected rescaled range approach for random - noise of the size of the input data. - * The "empirical Hurst exponent" is the uncorrected Hurst exponent - obtained by the rescaled range approach. - * The "corrected empirical Hurst exponent" is the - Anis-Lloyd-Peters corrected Hurst exponent, but with - sqrt(1/2 * pi * n) added to the (R/S)_n before the log. - * The "corrected R over S Hurst exponent" uses the R-function "lm" - instead of pracmas own "polyfit" and uses n = N/2, N/4, N/8, ... - by successively halving the subsequences (which means that some - subsequences may be one element longer than others). In contrast - to its name it does not use the Anis-Lloyd-Peters correction - factor. 
- - If you want to compare the output of pracma to the output of - nolds, the "empirical hurst exponent" is the only measure that - exactly corresponds to the Hurst measure implemented in nolds - (by choosing corrected=False, fit="poly" and employing the same - strategy for choosing n as the divisors of the (reduced) - sequence length). - .. [h_b] Rafael Weron, "HURST: MATLAB function to compute the Hurst - exponent using R/S Analysis", - url: https://ideas.repec.org/c/wuu/hscode/m11003.html - - Note: When the same values for nvals are used and fit is set to - "poly", nolds yields exactly the same results as this - implementation. - .. [h_c] Bill Davidson, "Hurst exponent", - url: http://www.mathworks.com/matlabcentral/fileexchange/9842-hurst-exponent - - Args: - data (array-like of float): - time series - Kwargs: - nvals (iterable of int): - sizes of subseries to use - (default: logmid_n(total_N, ratio=1/4.0, nsteps=15) , that is 15 - logarithmically spaced values in the medium 25% of the logarithmic range) - - Generally, the choice for n is a trade-off between the length and the - number of the subsequences that are used for the calculation of the - (R/S)_n. Very low values of n lead to high variance in the ``r`` and - ``s`` while very high values may leave too few subsequences that the mean - along them is still meaningful. Logarithmic spacing makes sense, because - it translates to even spacing in the log-log-plot. 
- fit (str): - the fitting method to use for the line fit, either 'poly' for normal - least squares polynomial fitting or 'RANSAC' for RANSAC-fitting which - is more robust to outliers - debug_plot (boolean): - if True, a simple plot of the final line-fitting step will be shown - debug_data (boolean): - if True, debugging data will be returned alongside the result - plot_file (str): - if debug_plot is True and plot_file is not None, the plot will be saved - under the given file name instead of directly showing it through - ``plt.show()`` - corrected (boolean): - if True, the Anis-Lloyd-Peters correction factor will be applied to the - output according to the expected value for the individual (R/S)_n - (see [h_3]_) - unbiased (boolean): - if True, the standard deviation based on the unbiased variance - (1/(N-1) instead of 1/N) will be used. This should be the default choice, - since the true mean of the sequences is not known. This parameter should - only be changed to recreate results of other implementations. 
- - Returns: - float: - estimated Hurst exponent K using a rescaled range approach (if K = 0.5 - there are no long-range correlations in the data, if K < 0.5 there are - negative long-range correlations, if K > 0.5 there are positive - long-range correlations) - (1d-vector, 1d-vector, list): - only present if debug_data is True: debug data of the form - ``(nvals, rsvals, poly)`` where ``nvals`` are the values used for log(n), - ``rsvals`` are the corresponding log((R/S)_n) and ``poly`` are the line - coefficients (``[slope, intercept]``) - """ - data = np.asarray(data) - total_N = len(data) - if nvals is None: - # chooses a default value for nvals that will give 15 logarithmically - # spaced datapoints leaning towards the middle of the logarithmic range - # (since both too small and too large n introduce too much variance) - nvals = logmid_n(total_N, ratio=1/4.0, nsteps=15) - # get individual values for (R/S)_n - rsvals = np.array([rs(data, n, unbiased=unbiased) for n in nvals]) - # filter NaNs (zeros should not be possible, because if R is 0 then - # S is also zero) - not_nan = np.logical_not(np.isnan(rsvals)) - rsvals = rsvals[not_nan] - nvals = np.asarray(nvals)[not_nan] - # it may happen that no rsvals are left (if all values of data are the same) - if len(rsvals) == 0: - poly = [np.nan, np.nan] - if debug_plot: - warnings.warn( - "Cannot display debug plot, all (R/S)_n are NaN", - RuntimeWarning, stacklevel=2, - ) - else: - # fit a line to the logarithm of the obtained (R/S)_n - xvals = np.log(nvals) - yvals = np.log(rsvals) - if corrected: - yvals -= np.log([expected_rs(n) for n in nvals]) - poly = poly_fit(xvals, yvals, 1, fit=fit) - if debug_plot: - plot_reg(xvals, yvals, poly, "log(n)", "log((R/S)_n)", - fname=plot_file) - # account for correction if necessary - h = poly[0] + 0.5 if corrected else poly[0] - # return line slope (+ correction) as hurst exponent - if debug_data: - return (h, (np.log(nvals), np.log(rsvals), poly)) - return h + if fname is 
None: + plt.show() + else: + plt.savefig(fname) + plt.close() + + +def plot_reg_tiled( + xvals, + yvals, + polys, + x_label="x", + y_label="y", + data_labels=None, + reg_labels=None, + fname=None, + columns=None, +) -> None: + """TODO.""" + # local import to avoid dependency for non-debug use + import matplotlib.pyplot as plt + + max_span = max([np.max(y) - np.min(y) for y in yvals]) + means = [np.mean(y) for y in yvals] + if columns is None: + columns = min(4, int(np.ceil(np.sqrt(len(xvals))))) + if data_labels is None: + data_labels = ["data"] * len(xvals) + if reg_labels is None: + reg_labels = ["regression line"] * len(xvals) + for i in range(len(xvals)): + plt.subplot(int(np.ceil(len(xvals) / columns)), columns, i + 1) + plt.plot(xvals[i], yvals[i], "bo", label=data_labels[i]) + if polys is not None: + plt.plot(xvals[i], np.polyval(polys[i], xvals[i]), "r-", label=reg_labels[i]) + plt.xlabel(x_label) + plt.ylabel(y_label) + plt.ylim(means[i] - max_span / 2, means[i] + max_span / 2) + plt.legend(loc="best") + if fname is None: + plt.show() + else: + plt.savefig(fname) + plt.close() + + +def plot_reg_multiple( + xvals, yvals, polys, x_label="x", y_label="y", data_labels=None, reg_labels=None, fname=None +) -> None: + """TODO.""" + import matplotlib.pyplot as plt + + if data_labels is None: + data_labels = ["data"] * len(xvals) + if reg_labels is None: + reg_labels = ["regression line"] * len(xvals) + for i in range(len(xvals)): + plt.plot(xvals[i], yvals[i], "+", label=data_labels[i]) + if polys is not None: + plt.plot(xvals[i], np.polyval(polys[i], xvals[i]), label=reg_labels[i]) + plt.xlabel(x_label) + plt.ylabel(y_label) + plt.legend(loc="best") + if fname is None: + plt.show() + else: + plt.savefig(fname) + plt.close() + + +def hurst_rs( + data, + nvals=None, + fit="RANSAC", + debug_plot=False, + debug_data=False, + plot_file=None, + corrected=True, + unbiased=True, +): + """Calculates the Hurst exponent by a standard rescaled range (R/S) approach. 
+ + Explanation of Hurst exponent: + The Hurst exponent is a measure for the "long-term memory" of a + time series, meaning the long statistical dependencies in the data that do + not originate from cycles. + + It originates from H.E. Hursts observations of the problem of long-term + storage in water reservoirs. If x_i is the discharge of a river in year i + and we observe this discharge for N years, we can calculate the storage + capacity that would be required to keep the discharge steady at its mean + value. + + To do so, we first subtract the mean over all x_i from the individual + x_i to obtain the departures x'_i from the mean for each year i. As the + excess or deficit in discharge always carries over from year i to year i+1, + we need to examine the cumulative sum of x'_i, denoted by y_i. This + cumulative sum represents the filling of our hypothetical storage. If the + sum is above 0, we are storing excess discharge from the river, if it is + below zero we have compensated a deficit in discharge by releasing + water from the storage. The range (maximum - minimum) R of y_i therefore + represents the total capacity required for the storage. + + Hurst showed that this value follows a steady trend for varying N if it + is normalized by the standard deviation sigma over the x_i. Namely he + obtained the following formula: + + R/sigma = (N/2)^K + + In this equation, K is called the Hurst exponent. Its value is 0.5 for + white noise, but becomes greater for time series that exhibit some positive + dependency on previous values. For negative dependencies it becomes less + than 0.5. + + Explanation of the algorithm: + The rescaled range (R/S) approach is directly derived from Hurst's + definition. The time series of length N is split into non-overlapping + subseries of length n. Then, R and S (S = sigma) are calculated for each + subseries and the mean is taken over all subseries yielding (R/S)_n. This + process is repeated for several lengths n. 
Finally, the exponent K is + obtained by fitting a straight line to the plot of log((R/S)_n) vs log(n). + + There seems to be no consensus how to chose the subseries lenghts n. + This function therefore leaves the choice to the user. The module provides + some utility functions for "typical" values: + + * binary_n: N/2, N/4, N/8, ... + * logarithmic_n: min_n, min_n * f, min_n * f^2, ... + + References: + .. [h_1] H. E. Hurst, “The problem of long-term storage in reservoirs,” + International Association of Scientific Hydrology. Bulletin, vol. 1, + no. 3, pp. 13–27, 1956. + .. [h_2] H. E. Hurst, “A suggested statistical model of some time series + which occur in nature,” Nature, vol. 180, p. 494, 1957. + .. [h_3] R. Weron, “Estimating long-range dependence: finite sample + properties and confidence intervals,” Physica A: Statistical Mechanics + and its Applications, vol. 312, no. 1, pp. 285–299, 2002. + + Reference Code: + .. [h_a] "hurst" function in R-package "pracma", + url: https://cran.r-project.org/web/packages/pracma/pracma.pdf + + Note: Pracma yields several estimates of the Hurst exponent, which + are listed below. Unless otherwise stated they use the divisors + of the length of the sequence as n. The length is reduced by at + most 1% to find the value that has the most divisors. + + * The "Simple R/S" estimate is just log((R/S)_n) / log(n) for + n = N. + * The "theoretical Hurst exponent" is the value that would be + expected of an uncorrected rescaled range approach for random + noise of the size of the input data. + * The "empirical Hurst exponent" is the uncorrected Hurst exponent + obtained by the rescaled range approach. + * The "corrected empirical Hurst exponent" is the + Anis-Lloyd-Peters corrected Hurst exponent, but with + sqrt(1/2 * pi * n) added to the (R/S)_n before the log. + * The "corrected R over S Hurst exponent" uses the R-function "lm" + instead of pracmas own "polyfit" and uses n = N/2, N/4, N/8, ... 
+ by successively halving the subsequences (which means that some + subsequences may be one element longer than others). In contrast + to its name it does not use the Anis-Lloyd-Peters correction + factor. + + If you want to compare the output of pracma to the output of + nolds, the "empirical hurst exponent" is the only measure that + exactly corresponds to the Hurst measure implemented in nolds + (by choosing corrected=False, fit="poly" and employing the same + strategy for choosing n as the divisors of the (reduced) + sequence length). + .. [h_b] Rafael Weron, "HURST: MATLAB function to compute the Hurst + exponent using R/S Analysis", + url: https://ideas.repec.org/c/wuu/hscode/m11003.html + + Note: When the same values for nvals are used and fit is set to + "poly", nolds yields exactly the same results as this + implementation. + .. [h_c] Bill Davidson, "Hurst exponent", + url: http://www.mathworks.com/matlabcentral/fileexchange/9842-hurst-exponent + + Args: + data (array-like of float): + time series + Kwargs: + nvals (iterable of int): + sizes of subseries to use + (default: logmid_n(total_N, ratio=1/4.0, nsteps=15) , that is 15 + logarithmically spaced values in the medium 25% of the logarithmic range) + + Generally, the choice for n is a trade-off between the length and the + number of the subsequences that are used for the calculation of the + (R/S)_n. Very low values of n lead to high variance in the ``r`` and + ``s`` while very high values may leave too few subsequences that the mean + along them is still meaningful. Logarithmic spacing makes sense, because + it translates to even spacing in the log-log-plot. 
+ fit (str): + the fitting method to use for the line fit, either 'poly' for normal + least squares polynomial fitting or 'RANSAC' for RANSAC-fitting which + is more robust to outliers + debug_plot (boolean): + if True, a simple plot of the final line-fitting step will be shown + debug_data (boolean): + if True, debugging data will be returned alongside the result + plot_file (str): + if debug_plot is True and plot_file is not None, the plot will be saved + under the given file name instead of directly showing it through + ``plt.show()`` + corrected (boolean): + if True, the Anis-Lloyd-Peters correction factor will be applied to the + output according to the expected value for the individual (R/S)_n + (see [h_3]_) + unbiased (boolean): + if True, the standard deviation based on the unbiased variance + (1/(N-1) instead of 1/N) will be used. This should be the default choice, + since the true mean of the sequences is not known. This parameter should + only be changed to recreate results of other implementations. 
+ + Returns: + float: + estimated Hurst exponent K using a rescaled range approach (if K = 0.5 + there are no long-range correlations in the data, if K < 0.5 there are + negative long-range correlations, if K > 0.5 there are positive + long-range correlations) + (1d-vector, 1d-vector, list): + only present if debug_data is True: debug data of the form + ``(nvals, rsvals, poly)`` where ``nvals`` are the values used for log(n), + ``rsvals`` are the corresponding log((R/S)_n) and ``poly`` are the line + coefficients (``[slope, intercept]``) + """ + data = np.asarray(data) + total_N = len(data) + if nvals is None: + # chooses a default value for nvals that will give 15 logarithmically + # spaced datapoints leaning towards the middle of the logarithmic range + # (since both too small and too large n introduce too much variance) + nvals = logmid_n(total_N, ratio=1 / 4.0, nsteps=15) + # get individual values for (R/S)_n + rsvals = np.array([rs(data, n, unbiased=unbiased) for n in nvals]) + # filter NaNs (zeros should not be possible, because if R is 0 then + # S is also zero) + not_nan = np.logical_not(np.isnan(rsvals)) + rsvals = rsvals[not_nan] + nvals = np.asarray(nvals)[not_nan] + # it may happen that no rsvals are left (if all values of data are the same) + if len(rsvals) == 0: + poly = [np.nan, np.nan] + if debug_plot: + warnings.warn( + "Cannot display debug plot, all (R/S)_n are NaN", + RuntimeWarning, + stacklevel=2, + ) + else: + # fit a line to the logarithm of the obtained (R/S)_n + xvals = np.log(nvals) + yvals = np.log(rsvals) + if corrected: + yvals -= np.log([expected_rs(n) for n in nvals]) + poly = poly_fit(xvals, yvals, 1, fit=fit) + if debug_plot: + plot_reg(xvals, yvals, poly, "log(n)", "log((R/S)_n)", fname=plot_file) + # account for correction if necessary + h = poly[0] + 0.5 if corrected else poly[0] + # return line slope (+ correction) as hurst exponent + if debug_data: + return (h, (np.log(nvals), np.log(rsvals), poly)) + return h + # TODO 
implement MFDFA as second (more reliable) measure for multifractality # NOTE: probably not needed, since mfhurst_b is already pretty reliable -def mfhurst_b(data, qvals=None, dists=None, fit="poly", - debug_plot=False, debug_data=False, plot_file=None): - r"""Calculates the Generalized Hurst Exponent H_q for different q according to - A.-L. Barabási and T. Vicsek. - - Explanation of the Generalized Hurst Exponent: - The Generalized Hurst Exponent (GHE, H_q or H(q)) can (as the name implies) - be seen as a generalization of the Hurst exponent for data series with - multifractal properties. It's origins are however not directly related - to Hurst's rescaled range approach, but to the definition of self-affine - functions. - - A single-valued self-affine function h by definition satisfies the relation - - h(x) ~= lambda^(-H) h(lambda x) - - for any positive real valued lambda and some positive real valued exponent - H, which is called the Hurst, Hölder, Hurst-Hölder or roughness exponent - in the literature. In other words you can view lambda as a scaling factor - or "step size". With lambda < 1 we decrease the step size and zoom into our - function. In this case lambda^(-H) becomes greater than one, meaning that - h(lambda x) looks similar to a smaller version of h(x). With lambda > 1 we - zoom out and get lambda^(-H) < 1. - - To calculate H, you can use the height-height correlation function (also - called autocorrelation) c(d) = <(h(x) - h(x + d))^2>_x where <...>_x - denotes the expected value over x. Here, the aforementioned self-affine - property is equivalent to c(d) ~ d^(2H). You can also think of d as a step - size. Increasing or decreasing d from 1 to some y is the same as setting - lambda = y: It increases or decreases the scale of the function by a factor - of 1/y^(-H) = y^H. Therefore the squared differences will be proportional - to y^2H. - - A.-L. Barabási and T. 
Vicsek extended this notion to an infinite hierarchy - of exponents H_q for the qth-order correlation function with - - c_q(d) = <(h(x) - h(x + d))^q>_x ~ d^(q H_q) - - With q = 1 you get a value H_1 that is closely related to the normal Hurst - exponent, but with different q you either get a constant value H_q = H_0 - independent of q, which indicates that the function has no multifractal - properties, or different H_q, which is a sign for multifractal behavior. - - T. Di Matteo, T. Aste and M. M. Dacorogna applied this technique to - financial data series and gave it the name "Generalized Hurst Exponent". - - Explanation of the Algorithm: - Curiously, I could not find any algorithmic description how to calculate - H_q in the literature. Researchers seem to just imply that you can obtain - the exponent by a line fitting algorithm in a log-log plot, but they do not - talk about the actual procedure or the required parameters. - - Essentially, we can calculate c_q(d) of a discrete evenly sampled time - series Y = [y_0, y_1, y_2, ... y_(N-1)] by taking the absolute differences - [\|y_0 - y_d\|, \|y_1 - y_(d+1)\|, ... , \|y_(N-d-1) - y_(N-1)\|] raising them to - the qth power and taking the mean. - - Now we take the logarithm on both sides of our relation c_q(d) ~ d^(q H_q) - and get - - log(c_q(d)) ~ log(d) * q H_q - - So in other words if we plot log(c_q(d)) against log(d) for several d we - should get a straight line with slope q H_q. This enables us to use a - linear least squares algorithm to obtain H_q. - - Note that we consider x as a discrete variable in the range 0 <= x < N. - We can do this, because the actual sampling rate of our data series does - not alter the result. After taking the logarithm any scaling factor delta_x - would only result in an additive term since - log(delta_x * x) = log(x) + log(delta_x) and we only care about the slope - of the line and not the intercept. - - References: - .. [mh_1] A.-L. Barabási and T. 
Vicsek, “Multifractality of self-affine - fractals,” Physical Review A, vol. 44, no. 4, pp. 2730–2733, 1991. - - Args: - data (array-like of float): - time series of data points (should be evenly sampled) - - Kwargs: - qvals (iterable of float or int): - values of q for which H_q should be calculated (default: [1]) - dists (iterable of int): - distances for which the height-height correlation should be calculated - (determines the x-coordinates in the log-log plot) - default: logarithmic_n(1, max(20, 0.02 * len(data)), 1.5) to ensure - even spacing on the logarithmic axis - fit (str): - the fitting method to use for the line fit, either 'poly' for normal - least squares polynomial fitting or 'RANSAC' for RANSAC-fitting which - is more robust to outliers - debug_plot (boolean): - if True, a simple plot of the final line-fitting step will be shown - debug_data (boolean): - if True, debugging data will be returned alongside the result - plot_file (str): - if debug_plot is True and plot_file is not None, the plot will be saved - under the given file name instead of directly showing it through - ``plt.show()`` - - Returns: - array of float: - list of H_q for every q given in ``qvals`` - (1d-vector, 2d-vector, 2d-vector): - only present if debug_data is True: debug data of the form - ``(xvals, yvals, poly)`` where ``xvals`` is the logarithm of ``dists``, - ``yvals`` are the logarithms of the corresponding height-height- - correlations for each distance (first dimension) and each q - (second dimension) in the shape len(dists) x len(qvals) and ``poly`` are - the line coefficients (``[slope, intercept]``) for each q in the shape - len(qvals) x 2. 
- """ - # transform to array if necessary - data = np.asarray(data, dtype=np.float64) - if qvals is None: - # actual default parameter would introduce shared list - # see: http://pylint-messages.wikidot.com/messages:w0102 - qvals = [1] - if dists is None: - dists = logarithmic_n(1, max(20, 0.02 * len(data)), 1.5) - dists = np.asarray(dists) - if len(data) < 60: - warnings.warn( - f"H(q) is not reliable for small time series ({len(data)} < 60)", stacklevel=2, - ) - - def hhcorr(d, q): - diffs = np.abs(data[:-d] - data[d:]) - diffs = diffs[np.where(diffs > 0)] - return np.mean(diffs ** q) - - # calculate height-height correlations - corrvals = [hhcorr(d, q) for d in dists for q in qvals] - corrvals = np.array(corrvals, dtype=np.float64) - corrvals = corrvals.reshape(len(dists), len(qvals)) - - # line fitting - xvals = np.log(dists) - yvals = np.log(corrvals) - polys = [ - poly_fit(xvals, yvals[:, qi], 1, fit=fit) - for qi in range(len(qvals)) - ] - H = np.array(polys)[:, 0] / qvals - if debug_plot: - plot_reg_multiple( - [xvals] * len(qvals), - [yvals[:, qi] / qvals[qi] for qi in range(len(qvals))], - [p / q for p, q in zip(polys, qvals)], - x_label="log(x)", y_label="$\\log(c_q(x)) / q$", - data_labels=["q = %d" % q for q in qvals], - reg_labels=[f"reg. line (H = {h:.3f})" for h in H], - fname=plot_file, - ) - if debug_data: - return H, (xvals, yvals, polys) - return H +def mfhurst_b( + data, qvals=None, dists=None, fit="poly", debug_plot=False, debug_data=False, plot_file=None +): + r"""Calculates the Generalized Hurst Exponent H_q for different q according to + A.-L. Barabási and T. Vicsek. + + Explanation of the Generalized Hurst Exponent: + The Generalized Hurst Exponent (GHE, H_q or H(q)) can (as the name implies) + be seen as a generalization of the Hurst exponent for data series with + multifractal properties. It's origins are however not directly related + to Hurst's rescaled range approach, but to the definition of self-affine + functions. 
+ + A single-valued self-affine function h by definition satisfies the relation + + h(x) ~= lambda^(-H) h(lambda x) + + for any positive real valued lambda and some positive real valued exponent + H, which is called the Hurst, Hölder, Hurst-Hölder or roughness exponent + in the literature. In other words you can view lambda as a scaling factor + or "step size". With lambda < 1 we decrease the step size and zoom into our + function. In this case lambda^(-H) becomes greater than one, meaning that + h(lambda x) looks similar to a smaller version of h(x). With lambda > 1 we + zoom out and get lambda^(-H) < 1. + + To calculate H, you can use the height-height correlation function (also + called autocorrelation) c(d) = <(h(x) - h(x + d))^2>_x where <...>_x + denotes the expected value over x. Here, the aforementioned self-affine + property is equivalent to c(d) ~ d^(2H). You can also think of d as a step + size. Increasing or decreasing d from 1 to some y is the same as setting + lambda = y: It increases or decreases the scale of the function by a factor + of 1/y^(-H) = y^H. Therefore the squared differences will be proportional + to y^2H. + + A.-L. Barabási and T. Vicsek extended this notion to an infinite hierarchy + of exponents H_q for the qth-order correlation function with + + c_q(d) = <(h(x) - h(x + d))^q>_x ~ d^(q H_q) + + With q = 1 you get a value H_1 that is closely related to the normal Hurst + exponent, but with different q you either get a constant value H_q = H_0 + independent of q, which indicates that the function has no multifractal + properties, or different H_q, which is a sign for multifractal behavior. + + T. Di Matteo, T. Aste and M. M. Dacorogna applied this technique to + financial data series and gave it the name "Generalized Hurst Exponent". + + Explanation of the Algorithm: + Curiously, I could not find any algorithmic description how to calculate + H_q in the literature. 
Researchers seem to just imply that you can obtain + the exponent by a line fitting algorithm in a log-log plot, but they do not + talk about the actual procedure or the required parameters. + + Essentially, we can calculate c_q(d) of a discrete evenly sampled time + series Y = [y_0, y_1, y_2, ... y_(N-1)] by taking the absolute differences + [\|y_0 - y_d\|, \|y_1 - y_(d+1)\|, ... , \|y_(N-d-1) - y_(N-1)\|] raising them to + the qth power and taking the mean. + + Now we take the logarithm on both sides of our relation c_q(d) ~ d^(q H_q) + and get + + log(c_q(d)) ~ log(d) * q H_q + + So in other words if we plot log(c_q(d)) against log(d) for several d we + should get a straight line with slope q H_q. This enables us to use a + linear least squares algorithm to obtain H_q. + + Note that we consider x as a discrete variable in the range 0 <= x < N. + We can do this, because the actual sampling rate of our data series does + not alter the result. After taking the logarithm any scaling factor delta_x + would only result in an additive term since + log(delta_x * x) = log(x) + log(delta_x) and we only care about the slope + of the line and not the intercept. + + References: + .. [mh_1] A.-L. Barabási and T. Vicsek, “Multifractality of self-affine + fractals,” Physical Review A, vol. 44, no. 4, pp. 2730–2733, 1991. 
+ + Args: + data (array-like of float): + time series of data points (should be evenly sampled) + + Kwargs: + qvals (iterable of float or int): + values of q for which H_q should be calculated (default: [1]) + dists (iterable of int): + distances for which the height-height correlation should be calculated + (determines the x-coordinates in the log-log plot) + default: logarithmic_n(1, max(20, 0.02 * len(data)), 1.5) to ensure + even spacing on the logarithmic axis + fit (str): + the fitting method to use for the line fit, either 'poly' for normal + least squares polynomial fitting or 'RANSAC' for RANSAC-fitting which + is more robust to outliers + debug_plot (boolean): + if True, a simple plot of the final line-fitting step will be shown + debug_data (boolean): + if True, debugging data will be returned alongside the result + plot_file (str): + if debug_plot is True and plot_file is not None, the plot will be saved + under the given file name instead of directly showing it through + ``plt.show()`` + + Returns: + array of float: + list of H_q for every q given in ``qvals`` + (1d-vector, 2d-vector, 2d-vector): + only present if debug_data is True: debug data of the form + ``(xvals, yvals, poly)`` where ``xvals`` is the logarithm of ``dists``, + ``yvals`` are the logarithms of the corresponding height-height- + correlations for each distance (first dimension) and each q + (second dimension) in the shape len(dists) x len(qvals) and ``poly`` are + the line coefficients (``[slope, intercept]``) for each q in the shape + len(qvals) x 2. 
+ """ + # transform to array if necessary + data = np.asarray(data, dtype=np.float64) + if qvals is None: + # actual default parameter would introduce shared list + # see: http://pylint-messages.wikidot.com/messages:w0102 + qvals = [1] + if dists is None: + dists = logarithmic_n(1, max(20, 0.02 * len(data)), 1.5) + dists = np.asarray(dists) + if len(data) < 60: + warnings.warn( + f"H(q) is not reliable for small time series ({len(data)} < 60)", + stacklevel=2, + ) + + def hhcorr(d, q): + diffs = np.abs(data[:-d] - data[d:]) + diffs = diffs[np.where(diffs > 0)] + return np.mean(diffs**q) + + # calculate height-height correlations + corrvals = [hhcorr(d, q) for d in dists for q in qvals] + corrvals = np.array(corrvals, dtype=np.float64) + corrvals = corrvals.reshape(len(dists), len(qvals)) + + # line fitting + xvals = np.log(dists) + yvals = np.log(corrvals) + polys = [poly_fit(xvals, yvals[:, qi], 1, fit=fit) for qi in range(len(qvals))] + H = np.array(polys)[:, 0] / qvals + if debug_plot: + plot_reg_multiple( + [xvals] * len(qvals), + [yvals[:, qi] / qvals[qi] for qi in range(len(qvals))], + [p / q for p, q in zip(polys, qvals)], + x_label="log(x)", + y_label="$\\log(c_q(x)) / q$", + data_labels=["q = %d" % q for q in qvals], + reg_labels=[f"reg. 
line (H = {h:.3f})" for h in H], + fname=plot_file, + ) + if debug_data: + return H, (xvals, yvals, polys) + return H def _genhurst(S, q): @@ -1581,603 +1665,624 @@ def _genhurst(S, q): k = 0 for Tmax in range(5, 20): - - x = np.arange(1, Tmax+1, 1) + x = np.arange(1, Tmax + 1, 1) mcord = np.zeros((Tmax, 1)) - for tt in range(1, Tmax+1): - dV = S[np.arange(tt, L, tt)] - S[np.arange(tt, L, tt)-tt] - VV = S[np.arange(tt, L+tt, tt)-tt] + for tt in range(1, Tmax + 1): + dV = S[np.arange(tt, L, tt)] - S[np.arange(tt, L, tt) - tt] + VV = S[np.arange(tt, L + tt, tt) - tt] N = len(dV) + 1 - X = np.arange(1, N+1, dtype=np.float64) + X = np.arange(1, N + 1, dtype=np.float64) Y = VV - mx = np.sum(X)/N - SSxx = np.sum(X**2) - N*mx**2 - my = np.sum(Y)/N - SSxy = np.sum(np.multiply(X, Y)) - N*mx*my - cc1 = SSxy/SSxx - cc2 = my - cc1*mx + mx = np.sum(X) / N + SSxx = np.sum(X**2) - N * mx**2 + my = np.sum(Y) / N + SSxy = np.sum(np.multiply(X, Y)) - N * mx * my + cc1 = SSxy / SSxx + cc2 = my - cc1 * mx ddVd = dV - cc1 - VVVd = VV - np.multiply(cc1, np.arange(1, N+1, dtype=np.float64)) \ - - cc2 - mcord[tt-1] = np.mean(np.abs(ddVd)**q)/np.mean(np.abs(VVVd)**q) + VVVd = VV - np.multiply(cc1, np.arange(1, N + 1, dtype=np.float64)) - cc2 + mcord[tt - 1] = np.mean(np.abs(ddVd) ** q) / np.mean(np.abs(VVVd) ** q) mx = np.mean(np.log10(x)) - SSxx = np.sum(np.log10(x)**2) - Tmax*mx**2 + SSxx = np.sum(np.log10(x) ** 2) - Tmax * mx**2 my = np.mean(np.log10(mcord)) - SSxy = np.sum( - np.multiply( - np.log10(x), np.transpose(np.log10(mcord)), - ), - ) - Tmax*mx*my - H[k] = SSxy/SSxx + SSxy = ( + np.sum( + np.multiply( + np.log10(x), + np.transpose(np.log10(mcord)), + ), + ) + - Tmax * mx * my + ) + H[k] = SSxy / SSxx k = k + 1 - return np.mean(H)/q - + return np.mean(H) / q def _aste_line_fit(x, y): - """Simple linear regression with ordinary least squares - https://en.wikipedia.org/wiki/Simple_linear_regression. - - NOTE: this function is left here to demonstrate the correctness of - T. 
Aste's MATLAB code for hurst_multifractal_dm. You can get the same - results with a call to ``np.polyfit(x, y, 1)[::-1]``. - """ - # convert to float to avoid integer overflow problems - x = np.asarray(x, dtype=np.float64) - y = np.asarray(y, dtype=np.float64) - N = len(x) - mx = np.mean(x) - my = np.mean(y) - # calculate the variance in x - # sum((x - mx) ^ 2) = sum(x ^ 2) - 2 * sum(x * mx) + N * mx ^ 2 - # = sum(x ^ 2) - 2 * mx * sum(x) + N * mx ^ 2 - # = sum(x ^ 2) - 2 * mx * N * mx + N * mx ^ 2 - # = sum(x ^ 2) - N * mx ^ 2 - var = np.sum(x ** 2) - N * mx * mx - # corvariance of x and y - # sum((x - mx) * (y - my)) - # = sum(xy) - sum(mx * y) - sum(my * x) + N * mx * my - # = sum(xy) - mx * sum(y) - my * sum(x) + N * mx * my - # = sum(xy) - mx * my * N - my * mx * N + N * mx * my - # = sum(xy) - N * mx * my - # NOTE: T. Aste's code is a little confusing here - # X = 1:N; - # Y = S(((tt+1):tt:(L+tt))-tt)'; - # ... - # SSxy = sum(X.*Y) - N*mx*my; - # Here, Y is transposed and the multiplication for SSxy uses .* instead of *. - # This suggests that we have a matrix multiplication with (possible) - # broadcasting. If X was an array and not a range, we would have a NxN array - # as a result since size(X) = [1, N] and size(Y) = [N, 1]. Ranges behave - # differently in MATLAB and this is the only reason why we get the correct - # result here. - cov = np.sum(x * y) - N * mx * my - # calculate slope and intercept (this is correct again) - slope = cov / var - intercept = my - slope * mx - return [intercept, slope] - - -def mfhurst_dm(data, qvals=None, max_dists=range(5, 20), detrend=True, - fit="poly", debug_plot=False, debug_data=False, plot_file=None): - """Calculates the Generalized Hurst Exponent H_q for different q according to - the MATLAB code of Tomaso Aste - one of the authors that introduced this - measure. - - Explanation of the General Hurst Exponent: - See mfhurst_b. 
- - Warning: I do not recommend to use this function unless you want to reproduce - examples from Di Matteo et al.. From my experiments and a critical code - analysis it seems that mfhurst_b should provide more robust results. - - The design choices that make mfhurst_dm different than mfhurst_d are the - following: - - - By default, a linear trend is removed from the data. This can be sensible - in some application areas (such as stock market analysis), but I think - this should be an additional preprocessing step and not part of this - algorithm. - - In the calculation of the height-height correlations, the differences - (h(x) - h(x + d) are not calculated for every possible x from 0 to N-d-1, - but instead d is used as a step size for x. I see no justification for - this choice. It makes the algorithm run faster, but it also takes away - a lot of statistical robustness, especially for large values of d. - This effect can be clearly seen when setting `debug_plot` to `True`. - - The algorithm uses a linear scale for the distance values d = 1, 2, 3, - ..., tau_max. This is counter intuitive, since we later plot log(d) - against log(c_q(d)). A linear scale will have a bias towards larger - values in the logarithmic scale. A logarithmic scale for d seems to be - a more natural fit. If low values of d yield statistically unstable - results, they should simply be omitted. - - The algorithm tests multiple values for tau_max, which is the maximum - distance that will be calculated. In [mhd_1]_ the authors state that this - is done to test the robustness of the approach. However, taking the - mean of several runs with different tau_max will not produce any more - information than performing one run with the largest tau_max. Instead - it will only introduce a bias towards low values for d. - - References: - .. [mhd_1] T. Di Matteo, T. Aste, and M. M. 
Dacorogna, “Scaling behaviors - in differently developed markets,” Physica A: Statistical Mechanics - and its Applications, vol. 324, no. 1–2, pp. 183–188, 2003. - - Reference code: - .. [mhd_a] Tomaso Aste, "Generalized Hurst exponent", - url: http://de.mathworks.com/matlabcentral/fileexchange/30076-generalized-hurst-exponent - - Args: - data (1d-vector of float): - input data (should be evenly sampled) - qvals (1d-vector of float) - values of q for which H_q should be calculated (default: [1]) - - Kwargs: - max_dists (1d-vector of int): - different values to test for tau_max, the maximum value for the distance - d. The resulting H_q will be a mean of all H_q calculated with tau_max - = max_dists[0], max_dists[1], ... . - detrend (boolean): - if True, a linear trend will be removed from the data before H_q will - be calculated - fit (str): - the fitting method to use for the line fit, either 'poly' for normal - least squares polynomial fitting or 'RANSAC' for RANSAC-fitting which - is more robust to outliers - debug_plot (boolean): - if True, a simple plot of the final line-fitting step will be shown - debug_data (boolean): - if True, debugging data will be returned alongside the result - plot_file (str): - if debug_plot is True and plot_file is not None, the plot will be saved - under the given file name instead of directly showing it through - ``plt.show()`` - - Returns: - array of float: - array of mH_q for every q given in ``qvals`` where mH_q is the mean of - all H_q calculated for different max distances in max_dists. 
- array of float: - array of standard deviations sH_q for each mH_q returned - (1d-vector, 2d-vector, 2d-vector): - only present if debug_data is True: debug data of the form - ``(xvals, yvals, poly)`` where ``xvals`` is the logarithm of ``dists``, - ``yvals`` are the logarithms of the corresponding height-height- - correlations for each distance (first dimension) and each q - (second dimension) in the shape len(dists) x len(qvals) and ``poly`` are - the line coefficients (``[slope, intercept]``) for each q in the shape - len(qvals) x 2. - """ - # transform to array if necessary - data = np.asarray(data) - if qvals is None: - # actual default parameter would introduce shared list - # see: http://pylint-messages.wikidot.com/messages:w0102 - qvals = [1] - if len(data) < 60: - warnings.warn( - f"H(q) is not reliable for small time series ({len(data)} < 60)", stacklevel=2, - ) - max_max_dist = np.max(max_dists) - hhcorr = [] - # NOTE: I don't think it's a good idea to use a linear scale for the distance - # values. Our fit is in logarithmic space, so this will place more weight on - # the higher distance. This is not bad per se, but if you think that the - # first values are unreliable, it would be better to skip them alltogether. - for dist in range(1, max_max_dist+1): - # NOTE: I don't think applying a step size to the input data is reasonable. - # I cannot find any justification for this in the papers and reduces the - # number of points that we can use to make our mean statistically stable. 
- step_size = dist - stepdata = data[::step_size] - if detrend: - stepdata = detrend_data(stepdata, order=1) - diffs = stepdata[1:] - stepdata[:-1] - hhcorr.append([ - np.mean(np.abs(diffs) ** q) / np.mean(np.abs(stepdata) ** q) - for q in qvals - ]) - hhcorr = np.array(hhcorr, dtype=np.float64) - xvals = np.log(np.arange(1, max_max_dist+1)) - yvals = np.log(hhcorr) - # NOTE: Using several maximum distances seems to be a strange way to - # introduce stability, since it only places emphasis on the lower distance - # ranges and does not introduce any new information. - H = np.array([ - poly_fit(xvals[:md], yvals[:md, qi], 1, fit=fit)[0] - for qi in range(len(qvals)) - for md in max_dists - ], dtype=np.float64).reshape(len(qvals), len(max_dists)) - if debug_plot: - polys = [ - np.array(poly_fit(xvals, yvals[:, qi], 1)) / qvals[qi] - for qi in range(len(qvals)) - ] - plot_reg_multiple( - [xvals] * len(qvals), - [yvals[:, qi] / qvals[qi] for qi in range(len(qvals))], - polys, - x_label="log(x)", y_label="$\\log(c_q(x)) / q$", - data_labels=["q = %d" % q for q in qvals], - reg_labels=[f"reg. line (H = {h:.3f})" for h in H[:, -1] / qvals], - fname=plot_file, - ) - mH = np.mean(H, axis=1) / qvals - sH = np.std(H, axis=1) / qvals - if debug_data: - return [mH, sH, (xvals, yvals, polys)] - return [mH, sH] - - -def corr_dim(data, emb_dim, lag=1, rvals=None, dist=rowwise_euclidean, - fit="RANSAC", debug_plot=False, debug_data=False, plot_file=None): - """Calculates the correlation dimension with the Grassberger-Procaccia algorithm. - - Explanation of correlation dimension: - The correlation dimension is a characteristic measure that can be used - to describe the geometry of chaotic attractors. It is defined using the - correlation sum C(r) which is the fraction of pairs of points X_i in the - phase space whose distance is smaller than r. 
- - If the relation between C(r) and r can be described by the power law - - C(r) ~ r^D - - then D is called the correlation dimension of the system. - - In a d-dimensional system, the maximum value for D is d. This value is - obtained for systems that expand uniformly in each dimension with time. - The lowest possible value is 0 for a system with constant C(r) (i.e. a - system that visits just one point in the phase space). Generally if D is - lower than d and the system has an attractor, this attractor is called - "strange" and D is a measure of this "strangeness". - - Explanation of the algorithm: - The Grassberger-Procaccia algorithm calculates C(r) for a range of - different r and then fits a straight line into the plot of log(C(r)) - versus log(r). - - This version of the algorithm is created for one-dimensional (scalar) time - series. Therefore, before calculating C(r), a delay embedding of the time - series is performed to yield emb_dim dimensional vectors - Y_i = [X_i, X_(i+1*lag), X_(i+2*lag), ... X_(i+(embd_dim-1)*lag)]. Choosing - a higher value for emb_dim allows to reconstruct higher dimensional dynamics - and avoids "systematic errors due to corrections to scaling". Choosing a - higher value for lag allows to avoid overestimating correlation because - X_i ~= X_i+1, but it should also not be set too high to not underestimate - correlation due to exponential divergence of trajectories in chaotic systems. - - References: - .. [cd_1] P. Grassberger and I. Procaccia, “Characterization of strange - attractors,” Physical review letters, vol. 50, no. 5, p. 346, - 1983. - .. [cd_2] P. Grassberger and I. Procaccia, “Measuring the strangeness of - strange attractors,” Physica D: Nonlinear Phenomena, vol. 9, - no. 1, pp. 189–208, 1983. - .. [cd_3] P. Grassberger, “Grassberger-Procaccia algorithm,” - Scholarpedia, vol. 2, no. 5, p. 3043. - urL: http://www.scholarpedia.org/article/Grassberger-Procaccia_algorithm - - Reference Code: - .. 
[cd_a] "corrDim" function in R package "fractal", - url: https://cran.r-project.org/web/packages/fractal/fractal.pdf - .. [cd_b] Peng Yuehua, "Correlation dimension", - url: http://de.mathworks.com/matlabcentral/fileexchange/24089-correlation-dimension - - Args: - data (array-like of float): - time series of data points - emb_dim (int): - embedding dimension - Kwargs: - rvals (iterable of float): - list of values for to use for r - (default: logarithmic_r(0.1 * std, 0.5 * std, 1.03)) - dist (function (2d-array, 1d-array) -> 1d-array): - row-wise difference function - fit (str): - the fitting method to use for the line fit, either 'poly' for normal - least squares polynomial fitting or 'RANSAC' for RANSAC-fitting which - is more robust to outliers - debug_plot (boolean): - if True, a simple plot of the final line-fitting step will be shown - debug_data (boolean): - if True, debugging data will be returned alongside the result - plot_file (str): - if debug_plot is True and plot_file is not None, the plot will be saved - under the given file name instead of directly showing it through - ``plt.show()`` - - Returns: - float: - correlation dimension as slope of the line fitted to log(r) vs log(C(r)) - (1d-vector, 1d-vector, list): - only present if debug_data is True: debug data of the form - ``(rvals, csums, poly)`` where ``rvals`` are the values used for log(r), - ``csums`` are the corresponding log(C(r)) and ``poly`` are the line - coefficients (``[slope, intercept]``) - """ - # TODO determine lag in units of time instead of number of datapoints - data = np.asarray(data) - - # TODO what are good values for r? - # TODO do this for multiple values of emb_dim? - if rvals is None: - sd = np.std(data, ddof=1) - rvals = logarithmic_r(0.1 * sd, 0.5 * sd, 1.03) - orbit = delay_embedding(data, emb_dim, lag=lag) - n = len(orbit) - dists = np.zeros((len(orbit), len(orbit)), dtype=np.float64) - for i in range(len(orbit)): - # calculate distances between X_i and X_i+1, X_i+2, ... 
, X_n-1 - # NOTE: strictly speaking, [cd_1] does not specify to exclude self-matches - # however, since both [cd_2] and [cd_3] specify to only compare i with j != i - # or j > i respectively, it is safe to assume that this was an oversight in - # [cd_1] - d = dist(orbit[i+1:], orbit[i]) - dists[i+1:,i] = d # fill column i - dists[i,i+1:] = d # fill row i - csums = [] - for r in rvals: - # NOTE: The [cd_1] and [cd_2] both use the factor 1/N^2 here. - # However, since we only use these values to fit a line in a log-log plot - # any multiplicative constant doesn't change the result since it will - # only result in an offset on the y-axis. Also, [cd_3] has a point here - # in that if we exclude self-matches in the numerator, it makes sense to - # also exclude self-matches from the denominator. - s = 1.0 / (n * (n - 1)) * np.sum(dists <= r) - csums.append(s) - csums = np.array(csums) - # filter zeros from csums - nonzero = np.where(csums != 0) - rvals = np.array(rvals)[nonzero] - csums = csums[nonzero] - if len(csums) == 0: - # all sums are zero => we cannot fit a line - poly = [np.nan, np.nan] - else: - poly = poly_fit(np.log(rvals), np.log(csums), 1, fit=fit) - if debug_plot: - plot_reg(np.log(rvals), np.log(csums), poly, "log(r)", "log(C(r))", - fname=plot_file) - if debug_data: - return (poly[0], (np.log(rvals), np.log(csums), poly)) - return poly[0] + """Simple linear regression with ordinary least squares + https://en.wikipedia.org/wiki/Simple_linear_regression. + NOTE: this function is left here to demonstrate the correctness of + T. Aste's MATLAB code for hurst_multifractal_dm. You can get the same + results with a call to ``np.polyfit(x, y, 1)[::-1]``. 
+ """ + # convert to float to avoid integer overflow problems + x = np.asarray(x, dtype=np.float64) + y = np.asarray(y, dtype=np.float64) + N = len(x) + mx = np.mean(x) + my = np.mean(y) + # calculate the variance in x + # sum((x - mx) ^ 2) = sum(x ^ 2) - 2 * sum(x * mx) + N * mx ^ 2 + # = sum(x ^ 2) - 2 * mx * sum(x) + N * mx ^ 2 + # = sum(x ^ 2) - 2 * mx * N * mx + N * mx ^ 2 + # = sum(x ^ 2) - N * mx ^ 2 + var = np.sum(x**2) - N * mx * mx + # corvariance of x and y + # sum((x - mx) * (y - my)) + # = sum(xy) - sum(mx * y) - sum(my * x) + N * mx * my + # = sum(xy) - mx * sum(y) - my * sum(x) + N * mx * my + # = sum(xy) - mx * my * N - my * mx * N + N * mx * my + # = sum(xy) - N * mx * my + # NOTE: T. Aste's code is a little confusing here + # X = 1:N; + # Y = S(((tt+1):tt:(L+tt))-tt)'; + # ... + # SSxy = sum(X.*Y) - N*mx*my; + # Here, Y is transposed and the multiplication for SSxy uses .* instead of *. + # This suggests that we have a matrix multiplication with (possible) + # broadcasting. If X was an array and not a range, we would have a NxN array + # as a result since size(X) = [1, N] and size(Y) = [N, 1]. Ranges behave + # differently in MATLAB and this is the only reason why we get the correct + # result here. + cov = np.sum(x * y) - N * mx * my + # calculate slope and intercept (this is correct again) + slope = cov / var + intercept = my - slope * mx + return [intercept, slope] + + +def mfhurst_dm( + data, + qvals=None, + max_dists=range(5, 20), + detrend=True, + fit="poly", + debug_plot=False, + debug_data=False, + plot_file=None, +): + """Calculates the Generalized Hurst Exponent H_q for different q according to + the MATLAB code of Tomaso Aste - one of the authors that introduced this + measure. + + Explanation of the General Hurst Exponent: + See mfhurst_b. + + Warning: I do not recommend to use this function unless you want to reproduce + examples from Di Matteo et al.. 
From my experiments and a critical code + analysis it seems that mfhurst_b should provide more robust results. + + The design choices that make mfhurst_dm different from mfhurst_b are the + following: + + - By default, a linear trend is removed from the data. This can be sensible + in some application areas (such as stock market analysis), but I think + this should be an additional preprocessing step and not part of this + algorithm. + - In the calculation of the height-height correlations, the differences + (h(x) - h(x + d)) are not calculated for every possible x from 0 to N-d-1, + but instead d is used as a step size for x. I see no justification for + this choice. It makes the algorithm run faster, but it also takes away + a lot of statistical robustness, especially for large values of d. + This effect can be clearly seen when setting `debug_plot` to `True`. + - The algorithm uses a linear scale for the distance values d = 1, 2, 3, + ..., tau_max. This is counterintuitive, since we later plot log(d) + against log(c_q(d)). A linear scale will have a bias towards larger + values in the logarithmic scale. A logarithmic scale for d seems to be + a more natural fit. If low values of d yield statistically unstable + results, they should simply be omitted. + - The algorithm tests multiple values for tau_max, which is the maximum + distance that will be calculated. In [mhd_1]_ the authors state that this + is done to test the robustness of the approach. However, taking the + mean of several runs with different tau_max will not produce any more + information than performing one run with the largest tau_max. Instead + it will only introduce a bias towards low values for d. + + References: + .. [mhd_1] T. Di Matteo, T. Aste, and M. M. Dacorogna, “Scaling behaviors + in differently developed markets,” Physica A: Statistical Mechanics + and its Applications, vol. 324, no. 1–2, pp. 183–188, 2003. 
-def detrend_data(data, order=1, fit="poly"): - """Removes a trend of given order from the data.""" - # TODO also use this function in dfa - xvals = np.arange(len(data)) - trend = poly_fit(xvals, data, order, fit=fit) - return data - np.polyval(trend, xvals) - - -def dfa(data, nvals=None, overlap=True, order=1, fit_trend="poly", - fit_exp="RANSAC", debug_plot=False, debug_data=False, plot_file=None): - """Performs a detrended fluctuation analysis (DFA) on the given data. - - Recommendations for parameter settings by Hardstone et al.: - * nvals should be equally spaced on a logarithmic scale so that each window - scale hase the same weight - * min(nvals) < 4 does not make much sense as fitting a polynomial (even if - it is only of order 1) to 3 or less data points is very prone to errors. - * max(nvals) > len(data) / 10 does not make much sense as we will then have - less than 10 windows to calculate the average fluctuation - * use overlap=True to obtain more windows and therefore better statistics - (at an increased computational cost) - - Explanation of DFA: - Detrended fluctuation analysis, much like the Hurst exponent, is used to - find long-term statistical dependencies in time series. However, while the - Hurst exponent will indicate long-term correlations for any non-stationary - process (i.e. a stochastic process whose probability distribution changes - when shifted in time, such as a random walk whose mean changes over time), - DFA was designed to distinguish between correlations that are purely an - artifact of non-stationarity and those that show inherent long-term - behavior of the studied system. - - Mathematically, the long-term correlations that we are interested in can - be characterized using the autocorrelation function C(s). For a time series - (x_i) with i = 1, ..., N it is defined as follows: - - C(s) = 1/(N-s) * (y_1 * y_1+s + y_2 * y_2+s + ... y_(N-s) * y_N) - - with y_i = x_i - mean(x). 
If there are no correlations at all, C(s) would - be zero for s > 0. For short-range correlations, C(s) will decline - exponentially, but for long-term correlations the decline follows a power - law of the form C(s) ~ s^(-gamma) instead with 0 < gamma < 1. - - Due to noise and underlying trends, calculating C(s) directly is usually not - feasible. The main idea of DFA is therefore to remove trends up to a given - order from the input data and analyze the remaining fluctuations. Trends - in this sense are smooth signals with monotonous or slowly oscillating - behavior that are caused by external effects and not the dynamical system - under study. - - To get a hold of these trends, the first step is to calculate the "profile" - of our time series as the cumulative sum of deviations from the mean, - effectively integrating our data. This both smoothes out measurement noise - and makes it easier to distinguish the fractal properties of bounded time - series (i.e. time series whose values cannot grow or shrink beyond certain - bounds such as most biological or physical signals) by applying random walk - theory (see [dfa_3]_ and [dfa_4]_). - - y_i = x_1 - mean(x) + x_2 - mean(x) + ... + x_i - mean(x). - - After that, we split Y(i) into (usually non-overlapping) windows of length - n to calculate local trends at this given scale. The ith window of this - size has the form - - W_(n,i) = [y_i, y_(i+1), y_(i+2), ... y_(i+n-1)] - - The local trends are then removed for each window separately by fitting a - polynomial p_(n,i) to the window W_(n,i) and then calculating - W'_(n,i) = W_(n,i) - p_(n,i) (element-wise subtraction). - - This leaves us with the deviations from the trend - the "fluctuations" - - that we are interested in. To quantify them, we take the root mean square - of these fluctuations. 
It is important to note that we have to sum up all - individual fluctuations across all windows and divide by the total number - of fluctuations here before finally taking the root as last step. Some - implementations apply another root per window, which skews the result. - - The resulting fluctuation F(n) is then only dependent on the window size n, - the scale at which we observe our data. It behaves similar to the - autocorrelation function in that it follows a power-law for long-term - correlations: - - F(n) ~ n^alpha - - Where alpha is the Hurst parameter, which we can obtain from fitting a line - into the plot of log(n) versus log(F(n)) and taking the slope. - - The result can be interpreted as follows: For alpha < 1 the underlying - process is stationary and can be modelled as fractional Gaussian noise with - H = alpha. This means for alpha = 0.5 we have no long-term correlation or - "memory", for 0.5 < alpha < 1 we have positive long-term correlations and - for alpha < 0.5 the long-term correlations are negative. - - For alpha > 1 the underlying process is non-stationary and can be modeled - as fractional Brownian motion with H = alpha - 1. - - References: - .. [dfa_1] C.-K. Peng, S. V. Buldyrev, S. Havlin, M. Simons, - H. E. Stanley, and A. L. Goldberger, “Mosaic organization of - DNA nucleotides,” Physical Review E, vol. 49, no. 2, 1994. - .. [dfa_2] J. W. Kantelhardt, E. Koscielny-Bunde, H. H. A. Rego, S. - Havlin, and A. Bunde, “Detecting long-range correlations with - detrended fluctuation analysis,” Physica A: Statistical - Mechanics and its Applications, vol. 295, no. 3–4, pp. 441–454, - Jun. 2001, doi: 10.1016/S0378-4371(01)00144-3. - .. [dfa_3] C. Peng, J. M. Hausdorff, and A. L. Goldberger, “Fractal - mechanisms in neuronal control: human heartbeat and gait - dynamics in health and disease,” in Self-Organized Biological - Dynamics and Nonlinear Control, 1st ed., J. Walleczek, Ed., - Cambridge University Press, 2000, pp. 66–96. 
- doi: 10.1017/CBO9780511535338.006. - .. [dfa_4] A. Bashan, R. Bartsch, J. W. Kantelhardt, and S. Havlin, - “Comparison of detrending methods for fluctuation analysis,” - Physica A: Statistical Mechanics and its Applications, vol. 387, - no. 21, pp. 5080–5090, Sep. 2008, - doi: 10.1016/j.physa.2008.04.023. - .. [dfa_5] R. Hardstone, S.-S. Poil, G. Schiavone, R. Jansen, - V. V. Nikulin, H. D. Mansvelder, and K. Linkenkaer-Hansen, - “Detrended fluctuation analysis: A scale-free view on neuronal - oscillations,” Frontiers in Physiology, vol. 30, 2012. - - Reference code: - .. [dfa_a] Peter Jurica, "Introduction to MDFA in Python", - url: http://bsp.brain.riken.jp/~juricap/mdfa/mdfaintro.html - .. [dfa_b] JE Mietus, "dfa", - url: https://www.physionet.org/physiotools/dfa/dfa-1.htm - .. [dfa_c] "DFA" function in R package "fractal" - - Args: - data (array-like of float): - time series - Kwargs: - nvals (iterable of int): - subseries sizes at which to calculate fluctuation - (default: logarithmic_n(4, 0.1*len(data), 1.2)) - overlap (boolean): - if True, the windows W_(n,i) will have a 50% overlap, - otherwise non-overlapping windows will be used - order (int): - (polynomial) order of trend to remove - fit_trend (str): - the fitting method to use for fitting the trends, either 'poly' - for normal least squares polynomial fitting or 'RANSAC' for - RANSAC-fitting which is more robust to outliers but also tends to - lead to unstable results - fit_exp (str): - the fitting method to use for the line fit, either 'poly' for normal - least squares polynomial fitting or 'RANSAC' for RANSAC-fitting which - is more robust to outliers - debug_plot (boolean): - if True, a simple plot of the final line-fitting step will be shown - debug_data (boolean): - if True, debugging data will be returned alongside the result - plot_file (str): - if debug_plot is True and plot_file is not None, the plot will be saved - under the given file name instead of directly showing it through - 
``plt.show()`` - Returns: - float: - the estimate alpha for the Hurst parameter (alpha < 1: stationary - process similar to fractional Gaussian noise with H = alpha, - alpha > 1: non-stationary process similar to fractional Brownian - motion with H = alpha - 1) - (1d-vector, 1d-vector, list): - only present if debug_data is True: debug data of the form - ``(nvals, fluctuations, poly)`` where ``nvals`` are the values used for - log(n), ``fluctuations`` are the corresponding log(std(X,n)) and ``poly`` - are the line coefficients (``[slope, intercept]``) - """ - data = np.asarray(data) - total_N = len(data) - if nvals is None: - if total_N > 70: - nvals = logarithmic_n(4, 0.1 * total_N, 1.2) - elif total_N > 10: - nvals = [4, 5, 6, 7, 8, 9] + Reference code: + .. [mhd_a] Tomaso Aste, "Generalized Hurst exponent", + url: http://de.mathworks.com/matlabcentral/fileexchange/30076-generalized-hurst-exponent + + Args: + data (1d-vector of float): + input data (should be evenly sampled) + qvals (1d-vector of float) + values of q for which H_q should be calculated (default: [1]) + + Kwargs: + max_dists (1d-vector of int): + different values to test for tau_max, the maximum value for the distance + d. The resulting H_q will be a mean of all H_q calculated with tau_max + = max_dists[0], max_dists[1], ... . 
+ detrend (boolean): + if True, a linear trend will be removed from the data before H_q will + be calculated + fit (str): + the fitting method to use for the line fit, either 'poly' for normal + least squares polynomial fitting or 'RANSAC' for RANSAC-fitting which + is more robust to outliers + debug_plot (boolean): + if True, a simple plot of the final line-fitting step will be shown + debug_data (boolean): + if True, debugging data will be returned alongside the result + plot_file (str): + if debug_plot is True and plot_file is not None, the plot will be saved + under the given file name instead of directly showing it through + ``plt.show()`` + + Returns: + array of float: + array of mH_q for every q given in ``qvals`` where mH_q is the mean of + all H_q calculated for different max distances in max_dists. + array of float: + array of standard deviations sH_q for each mH_q returned + (1d-vector, 2d-vector, 2d-vector): + only present if debug_data is True: debug data of the form + ``(xvals, yvals, poly)`` where ``xvals`` is the logarithm of ``dists``, + ``yvals`` are the logarithms of the corresponding height-height- + correlations for each distance (first dimension) and each q + (second dimension) in the shape len(dists) x len(qvals) and ``poly`` are + the line coefficients (``[slope, intercept]``) for each q in the shape + len(qvals) x 2. + """ + # transform to array if necessary + data = np.asarray(data) + if qvals is None: + # actual default parameter would introduce shared list + # see: http://pylint-messages.wikidot.com/messages:w0102 + qvals = [1] + if len(data) < 60: + warnings.warn( + f"H(q) is not reliable for small time series ({len(data)} < 60)", + stacklevel=2, + ) + max_max_dist = np.max(max_dists) + hhcorr = [] + # NOTE: I don't think it's a good idea to use a linear scale for the distance + # values. Our fit is in logarithmic space, so this will place more weight on + # the higher distance. 
This is not bad per se, but if you think that the + # first values are unreliable, it would be better to skip them alltogether. + for dist in range(1, max_max_dist + 1): + # NOTE: I don't think applying a step size to the input data is reasonable. + # I cannot find any justification for this in the papers and reduces the + # number of points that we can use to make our mean statistically stable. + step_size = dist + stepdata = data[::step_size] + if detrend: + stepdata = detrend_data(stepdata, order=1) + diffs = stepdata[1:] - stepdata[:-1] + hhcorr.append([np.mean(np.abs(diffs) ** q) / np.mean(np.abs(stepdata) ** q) for q in qvals]) + hhcorr = np.array(hhcorr, dtype=np.float64) + xvals = np.log(np.arange(1, max_max_dist + 1)) + yvals = np.log(hhcorr) + # NOTE: Using several maximum distances seems to be a strange way to + # introduce stability, since it only places emphasis on the lower distance + # ranges and does not introduce any new information. + H = np.array( + [ + poly_fit(xvals[:md], yvals[:md, qi], 1, fit=fit)[0] + for qi in range(len(qvals)) + for md in max_dists + ], + dtype=np.float64, + ).reshape(len(qvals), len(max_dists)) + if debug_plot: + polys = [np.array(poly_fit(xvals, yvals[:, qi], 1)) / qvals[qi] for qi in range(len(qvals))] + plot_reg_multiple( + [xvals] * len(qvals), + [yvals[:, qi] / qvals[qi] for qi in range(len(qvals))], + polys, + x_label="log(x)", + y_label="$\\log(c_q(x)) / q$", + data_labels=["q = %d" % q for q in qvals], + reg_labels=[f"reg. line (H = {h:.3f})" for h in H[:, -1] / qvals], + fname=plot_file, + ) + mH = np.mean(H, axis=1) / qvals + sH = np.std(H, axis=1) / qvals + if debug_data: + return [mH, sH, (xvals, yvals, polys)] + return [mH, sH] + + +def corr_dim( + data, + emb_dim, + lag=1, + rvals=None, + dist=rowwise_euclidean, + fit="RANSAC", + debug_plot=False, + debug_data=False, + plot_file=None, +): + """Calculates the correlation dimension with the Grassberger-Procaccia algorithm. 
+ + Explanation of correlation dimension: + The correlation dimension is a characteristic measure that can be used + to describe the geometry of chaotic attractors. It is defined using the + correlation sum C(r) which is the fraction of pairs of points X_i in the + phase space whose distance is smaller than r. + + If the relation between C(r) and r can be described by the power law + + C(r) ~ r^D + + then D is called the correlation dimension of the system. + + In a d-dimensional system, the maximum value for D is d. This value is + obtained for systems that expand uniformly in each dimension with time. + The lowest possible value is 0 for a system with constant C(r) (i.e. a + system that visits just one point in the phase space). Generally if D is + lower than d and the system has an attractor, this attractor is called + "strange" and D is a measure of this "strangeness". + + Explanation of the algorithm: + The Grassberger-Procaccia algorithm calculates C(r) for a range of + different r and then fits a straight line into the plot of log(C(r)) + versus log(r). + + This version of the algorithm is created for one-dimensional (scalar) time + series. Therefore, before calculating C(r), a delay embedding of the time + series is performed to yield emb_dim dimensional vectors + Y_i = [X_i, X_(i+1*lag), X_(i+2*lag), ... X_(i+(embd_dim-1)*lag)]. Choosing + a higher value for emb_dim allows to reconstruct higher dimensional dynamics + and avoids "systematic errors due to corrections to scaling". Choosing a + higher value for lag allows to avoid overestimating correlation because + X_i ~= X_i+1, but it should also not be set too high to not underestimate + correlation due to exponential divergence of trajectories in chaotic systems. + + References: + .. [cd_1] P. Grassberger and I. Procaccia, “Characterization of strange + attractors,” Physical review letters, vol. 50, no. 5, p. 346, + 1983. + .. [cd_2] P. Grassberger and I. 
Procaccia, “Measuring the strangeness of + strange attractors,” Physica D: Nonlinear Phenomena, vol. 9, + no. 1, pp. 189–208, 1983. + .. [cd_3] P. Grassberger, “Grassberger-Procaccia algorithm,” + Scholarpedia, vol. 2, no. 5, p. 3043. + urL: http://www.scholarpedia.org/article/Grassberger-Procaccia_algorithm + + Reference Code: + .. [cd_a] "corrDim" function in R package "fractal", + url: https://cran.r-project.org/web/packages/fractal/fractal.pdf + .. [cd_b] Peng Yuehua, "Correlation dimension", + url: http://de.mathworks.com/matlabcentral/fileexchange/24089-correlation-dimension + + Args: + data (array-like of float): + time series of data points + emb_dim (int): + embedding dimension + Kwargs: + rvals (iterable of float): + list of values for to use for r + (default: logarithmic_r(0.1 * std, 0.5 * std, 1.03)) + dist (function (2d-array, 1d-array) -> 1d-array): + row-wise difference function + fit (str): + the fitting method to use for the line fit, either 'poly' for normal + least squares polynomial fitting or 'RANSAC' for RANSAC-fitting which + is more robust to outliers + debug_plot (boolean): + if True, a simple plot of the final line-fitting step will be shown + debug_data (boolean): + if True, debugging data will be returned alongside the result + plot_file (str): + if debug_plot is True and plot_file is not None, the plot will be saved + under the given file name instead of directly showing it through + ``plt.show()`` + + Returns: + float: + correlation dimension as slope of the line fitted to log(r) vs log(C(r)) + (1d-vector, 1d-vector, list): + only present if debug_data is True: debug data of the form + ``(rvals, csums, poly)`` where ``rvals`` are the values used for log(r), + ``csums`` are the corresponding log(C(r)) and ``poly`` are the line + coefficients (``[slope, intercept]``) + """ + # TODO determine lag in units of time instead of number of datapoints + data = np.asarray(data) + + # TODO what are good values for r? 
+ # TODO do this for multiple values of emb_dim? + if rvals is None: + sd = np.std(data, ddof=1) + rvals = logarithmic_r(0.1 * sd, 0.5 * sd, 1.03) + orbit = delay_embedding(data, emb_dim, lag=lag) + n = len(orbit) + dists = np.zeros((len(orbit), len(orbit)), dtype=np.float64) + for i in range(len(orbit)): + # calculate distances between X_i and X_i+1, X_i+2, ... , X_n-1 + # NOTE: strictly speaking, [cd_1] does not specify to exclude self-matches + # however, since both [cd_2] and [cd_3] specify to only compare i with j != i + # or j > i respectively, it is safe to assume that this was an oversight in + # [cd_1] + d = dist(orbit[i + 1 :], orbit[i]) + dists[i + 1 :, i] = d # fill column i + dists[i, i + 1 :] = d # fill row i + csums = [] + for r in rvals: + # NOTE: The [cd_1] and [cd_2] both use the factor 1/N^2 here. + # However, since we only use these values to fit a line in a log-log plot + # any multiplicative constant doesn't change the result since it will + # only result in an offset on the y-axis. Also, [cd_3] has a point here + # in that if we exclude self-matches in the numerator, it makes sense to + # also exclude self-matches from the denominator. 
+ s = 1.0 / (n * (n - 1)) * np.sum(dists <= r) + csums.append(s) + csums = np.array(csums) + # filter zeros from csums + nonzero = np.where(csums != 0) + rvals = np.array(rvals)[nonzero] + csums = csums[nonzero] + if len(csums) == 0: + # all sums are zero => we cannot fit a line + poly = [np.nan, np.nan] else: - nvals = [total_N-2, total_N-1] - msg = "choosing nvals = {} , DFA with less than ten data points is " \ - "extremely unreliable" - warnings.warn(msg.format(nvals), RuntimeWarning, stacklevel=2) - if len(nvals) < 2: - msg = "at least two nvals are needed" - raise ValueError(msg) - if np.min(nvals) < 2: - msg = "nvals must be at least two" - raise ValueError(msg) - if np.max(nvals) >= total_N: - msg = "nvals cannot be larger than the input size" - raise ValueError(msg) - # create the signal profile - # (cumulative sum of deviations from the mean => "walk") - walk = np.cumsum(data - np.mean(data)) - fluctuations = [] - for n in nvals: - assert n >= 2 - # subdivide data into chunks of size n - if overlap: - # step size n/2 instead of n - d = np.array([walk[i:i + n] for i in range(0, len(walk) - n, n // 2)]) + poly = poly_fit(np.log(rvals), np.log(csums), 1, fit=fit) + if debug_plot: + plot_reg(np.log(rvals), np.log(csums), poly, "log(r)", "log(C(r))", fname=plot_file) + if debug_data: + return (poly[0], (np.log(rvals), np.log(csums), poly)) + return poly[0] + + +def detrend_data(data, order=1, fit="poly"): + """Removes a trend of given order from the data.""" + # TODO also use this function in dfa + xvals = np.arange(len(data)) + trend = poly_fit(xvals, data, order, fit=fit) + return data - np.polyval(trend, xvals) + + +def dfa( + data, + nvals=None, + overlap=True, + order=1, + fit_trend="poly", + fit_exp="RANSAC", + debug_plot=False, + debug_data=False, + plot_file=None, +): + """Performs a detrended fluctuation analysis (DFA) on the given data. 
+ + Recommendations for parameter settings by Hardstone et al.: + * nvals should be equally spaced on a logarithmic scale so that each window + scale hase the same weight + * min(nvals) < 4 does not make much sense as fitting a polynomial (even if + it is only of order 1) to 3 or less data points is very prone to errors. + * max(nvals) > len(data) / 10 does not make much sense as we will then have + less than 10 windows to calculate the average fluctuation + * use overlap=True to obtain more windows and therefore better statistics + (at an increased computational cost) + + Explanation of DFA: + Detrended fluctuation analysis, much like the Hurst exponent, is used to + find long-term statistical dependencies in time series. However, while the + Hurst exponent will indicate long-term correlations for any non-stationary + process (i.e. a stochastic process whose probability distribution changes + when shifted in time, such as a random walk whose mean changes over time), + DFA was designed to distinguish between correlations that are purely an + artifact of non-stationarity and those that show inherent long-term + behavior of the studied system. + + Mathematically, the long-term correlations that we are interested in can + be characterized using the autocorrelation function C(s). For a time series + (x_i) with i = 1, ..., N it is defined as follows: + + C(s) = 1/(N-s) * (y_1 * y_1+s + y_2 * y_2+s + ... y_(N-s) * y_N) + + with y_i = x_i - mean(x). If there are no correlations at all, C(s) would + be zero for s > 0. For short-range correlations, C(s) will decline + exponentially, but for long-term correlations the decline follows a power + law of the form C(s) ~ s^(-gamma) instead with 0 < gamma < 1. + + Due to noise and underlying trends, calculating C(s) directly is usually not + feasible. The main idea of DFA is therefore to remove trends up to a given + order from the input data and analyze the remaining fluctuations. 
Trends + in this sense are smooth signals with monotonous or slowly oscillating + behavior that are caused by external effects and not the dynamical system + under study. + + To get a hold of these trends, the first step is to calculate the "profile" + of our time series as the cumulative sum of deviations from the mean, + effectively integrating our data. This both smoothes out measurement noise + and makes it easier to distinguish the fractal properties of bounded time + series (i.e. time series whose values cannot grow or shrink beyond certain + bounds such as most biological or physical signals) by applying random walk + theory (see [dfa_3]_ and [dfa_4]_). + + y_i = x_1 - mean(x) + x_2 - mean(x) + ... + x_i - mean(x). + + After that, we split Y(i) into (usually non-overlapping) windows of length + n to calculate local trends at this given scale. The ith window of this + size has the form + + W_(n,i) = [y_i, y_(i+1), y_(i+2), ... y_(i+n-1)] + + The local trends are then removed for each window separately by fitting a + polynomial p_(n,i) to the window W_(n,i) and then calculating + W'_(n,i) = W_(n,i) - p_(n,i) (element-wise subtraction). + + This leaves us with the deviations from the trend - the "fluctuations" - + that we are interested in. To quantify them, we take the root mean square + of these fluctuations. It is important to note that we have to sum up all + individual fluctuations across all windows and divide by the total number + of fluctuations here before finally taking the root as last step. Some + implementations apply another root per window, which skews the result. + + The resulting fluctuation F(n) is then only dependent on the window size n, + the scale at which we observe our data. 
It behaves similar to the + autocorrelation function in that it follows a power-law for long-term + correlations: + + F(n) ~ n^alpha + + Where alpha is the Hurst parameter, which we can obtain from fitting a line + into the plot of log(n) versus log(F(n)) and taking the slope. + + The result can be interpreted as follows: For alpha < 1 the underlying + process is stationary and can be modelled as fractional Gaussian noise with + H = alpha. This means for alpha = 0.5 we have no long-term correlation or + "memory", for 0.5 < alpha < 1 we have positive long-term correlations and + for alpha < 0.5 the long-term correlations are negative. + + For alpha > 1 the underlying process is non-stationary and can be modeled + as fractional Brownian motion with H = alpha - 1. + + References: + .. [dfa_1] C.-K. Peng, S. V. Buldyrev, S. Havlin, M. Simons, + H. E. Stanley, and A. L. Goldberger, “Mosaic organization of + DNA nucleotides,” Physical Review E, vol. 49, no. 2, 1994. + .. [dfa_2] J. W. Kantelhardt, E. Koscielny-Bunde, H. H. A. Rego, S. + Havlin, and A. Bunde, “Detecting long-range correlations with + detrended fluctuation analysis,” Physica A: Statistical + Mechanics and its Applications, vol. 295, no. 3–4, pp. 441–454, + Jun. 2001, doi: 10.1016/S0378-4371(01)00144-3. + .. [dfa_3] C. Peng, J. M. Hausdorff, and A. L. Goldberger, “Fractal + mechanisms in neuronal control: human heartbeat and gait + dynamics in health and disease,” in Self-Organized Biological + Dynamics and Nonlinear Control, 1st ed., J. Walleczek, Ed., + Cambridge University Press, 2000, pp. 66–96. + doi: 10.1017/CBO9780511535338.006. + .. [dfa_4] A. Bashan, R. Bartsch, J. W. Kantelhardt, and S. Havlin, + “Comparison of detrending methods for fluctuation analysis,” + Physica A: Statistical Mechanics and its Applications, vol. 387, + no. 21, pp. 5080–5090, Sep. 2008, + doi: 10.1016/j.physa.2008.04.023. + .. [dfa_5] R. Hardstone, S.-S. Poil, G. Schiavone, R. Jansen, + V. V. Nikulin, H. D. Mansvelder, and K. 
Linkenkaer-Hansen, + “Detrended fluctuation analysis: A scale-free view on neuronal + oscillations,” Frontiers in Physiology, vol. 30, 2012. + + Reference code: + .. [dfa_a] Peter Jurica, "Introduction to MDFA in Python", + url: http://bsp.brain.riken.jp/~juricap/mdfa/mdfaintro.html + .. [dfa_b] JE Mietus, "dfa", + url: https://www.physionet.org/physiotools/dfa/dfa-1.htm + .. [dfa_c] "DFA" function in R package "fractal" + + Args: + data (array-like of float): + time series + Kwargs: + nvals (iterable of int): + subseries sizes at which to calculate fluctuation + (default: logarithmic_n(4, 0.1*len(data), 1.2)) + overlap (boolean): + if True, the windows W_(n,i) will have a 50% overlap, + otherwise non-overlapping windows will be used + order (int): + (polynomial) order of trend to remove + fit_trend (str): + the fitting method to use for fitting the trends, either 'poly' + for normal least squares polynomial fitting or 'RANSAC' for + RANSAC-fitting which is more robust to outliers but also tends to + lead to unstable results + fit_exp (str): + the fitting method to use for the line fit, either 'poly' for normal + least squares polynomial fitting or 'RANSAC' for RANSAC-fitting which + is more robust to outliers + debug_plot (boolean): + if True, a simple plot of the final line-fitting step will be shown + debug_data (boolean): + if True, debugging data will be returned alongside the result + plot_file (str): + if debug_plot is True and plot_file is not None, the plot will be saved + under the given file name instead of directly showing it through + ``plt.show()`` + Returns: + float: + the estimate alpha for the Hurst parameter (alpha < 1: stationary + process similar to fractional Gaussian noise with H = alpha, + alpha > 1: non-stationary process similar to fractional Brownian + motion with H = alpha - 1) + (1d-vector, 1d-vector, list): + only present if debug_data is True: debug data of the form + ``(nvals, fluctuations, poly)`` where ``nvals`` are the values used 
for + log(n), ``fluctuations`` are the corresponding log(std(X,n)) and ``poly`` + are the line coefficients (``[slope, intercept]``) + """ + data = np.asarray(data) + total_N = len(data) + if nvals is None: + if total_N > 70: + nvals = logarithmic_n(4, 0.1 * total_N, 1.2) + elif total_N > 10: + nvals = [4, 5, 6, 7, 8, 9] + else: + nvals = [total_N - 2, total_N - 1] + msg = "choosing nvals = {} , DFA with less than ten data points is extremely unreliable" + warnings.warn(msg.format(nvals), RuntimeWarning, stacklevel=2) + if len(nvals) < 2: + msg = "at least two nvals are needed" + raise ValueError(msg) + if np.min(nvals) < 2: + msg = "nvals must be at least two" + raise ValueError(msg) + if np.max(nvals) >= total_N: + msg = "nvals cannot be larger than the input size" + raise ValueError(msg) + # create the signal profile + # (cumulative sum of deviations from the mean => "walk") + walk = np.cumsum(data - np.mean(data)) + fluctuations = [] + for n in nvals: + assert n >= 2 + # subdivide data into chunks of size n + if overlap: + # step size n/2 instead of n + d = np.array([walk[i : i + n] for i in range(0, len(walk) - n, n // 2)]) + else: + # non-overlapping windows => we can simply do a reshape + d = walk[: total_N - (total_N % n)] + d = d.reshape((total_N // n, n)) + # calculate local trends as polynomes + x = np.arange(n) + tpoly = [poly_fit(x, d[i], order, fit=fit_trend) for i in range(len(d))] + tpoly = np.array(tpoly) + trend = np.array([np.polyval(tpoly[i], x) for i in range(len(d))]) + # calculate mean-square differences for each walk in d around trend + flucs = np.sum((d - trend) ** 2, axis=1) / n + # take another mean across all walks and finally take the square root of that + # NOTE: To map this to the formula in Peng1995, observe that this simplifies + # to np.sqrt(np.sum((d - trend) ** 2) / total_N) if we have non-overlapping + # windows and the last window matches the end of the data perfectly. 
+ f_n = np.sqrt(np.sum(flucs) / len(flucs)) + fluctuations.append(f_n) + fluctuations = np.array(fluctuations) + # filter zeros from fluctuations + nonzero = np.where(fluctuations != 0) + nvals = np.array(nvals)[nonzero] + fluctuations = fluctuations[nonzero] + if len(fluctuations) == 0: + # all fluctuations are zero => we cannot fit a line + poly = [np.nan, np.nan] else: - # non-overlapping windows => we can simply do a reshape - d = walk[:total_N - (total_N % n)] - d = d.reshape((total_N // n, n)) - # calculate local trends as polynomes - x = np.arange(n) - tpoly = [poly_fit(x, d[i], order, fit=fit_trend) - for i in range(len(d))] - tpoly = np.array(tpoly) - trend = np.array([np.polyval(tpoly[i], x) for i in range(len(d))]) - # calculate mean-square differences for each walk in d around trend - flucs = np.sum((d - trend) ** 2, axis=1) / n - # take another mean across all walks and finally take the square root of that - # NOTE: To map this to the formula in Peng1995, observe that this simplifies - # to np.sqrt(np.sum((d - trend) ** 2) / total_N) if we have non-overlapping - # windows and the last window matches the end of the data perfectly. 
- f_n = np.sqrt(np.sum(flucs) / len(flucs)) - fluctuations.append(f_n) - fluctuations = np.array(fluctuations) - # filter zeros from fluctuations - nonzero = np.where(fluctuations != 0) - nvals = np.array(nvals)[nonzero] - fluctuations = fluctuations[nonzero] - if len(fluctuations) == 0: - # all fluctuations are zero => we cannot fit a line - poly = [np.nan, np.nan] - else: - poly = poly_fit(np.log(nvals), np.log(fluctuations), 1, - fit=fit_exp) - if debug_plot: - plot_reg(np.log(nvals), np.log(fluctuations), poly, "log(n)", "std(X,n)", - fname=plot_file) - if debug_data: - return (poly[0], (np.log(nvals), np.log(fluctuations), poly)) - return poly[0] + poly = poly_fit(np.log(nvals), np.log(fluctuations), 1, fit=fit_exp) + if debug_plot: + plot_reg(np.log(nvals), np.log(fluctuations), poly, "log(n)", "std(X,n)", fname=plot_file) + if debug_data: + return (poly[0], (np.log(nvals), np.log(fluctuations), poly)) + return poly[0] diff --git a/nolds/test_measures.py b/nolds/test_measures.py index a2123d4..b022a9b 100644 --- a/nolds/test_measures.py +++ b/nolds/test_measures.py @@ -15,643 +15,709 @@ # TODO split up tests into smaller units => one hypothesis = one test try: - from scipy.stats import levy_stable - SCIPY_AVAILABLE = True + from scipy.stats import levy_stable + + SCIPY_AVAILABLE = True except ImportError: - SCIPY_AVAILABLE = False + SCIPY_AVAILABLE = False class TestNoldsHelperFunctions(unittest.TestCase): - """Tests for internal helper functions that are not part of the public API.""" - def assert_array_equals(self, expected, actual, print_arrays=False) -> None: - if print_arrays: - pass - assert np.all(actual == expected) - - def test_delay_embed_lag2(self) -> None: - data = np.arange(10, dtype="float32") - embedded = nolds.delay_embedding(data, 4, lag=2) - expected = np.array([ - [0, 2, 4, 6], - [1, 3, 5, 7], - [2, 4, 6, 8], - [3, 5, 7, 9], - ], dtype="float32") - self.assert_array_equals(expected, embedded) - - def test_delay_embed(self) -> None: - data = 
np.arange(6, dtype="float32") - embedded = nolds.delay_embedding(data, 4) - expected = np.array([ - [0, 1, 2, 3], - [1, 2, 3, 4], - [2, 3, 4, 5], - ], dtype="float32") - self.assert_array_equals(expected, embedded) - - def test_delay_embed_lag3(self) -> None: - data = np.arange(10, dtype="float32") - embedded = nolds.delay_embedding(data, 4, lag=3) - expected = np.array([ - [0, 3, 6, 9], - ], dtype="float32") - self.assert_array_equals(expected, embedded) - - def test_delay_embed_empty(self) -> None: - data = np.arange(10, dtype="float32") - try: - embedded = nolds.delay_embedding(data, 11) - msg = "embedding array of size 10 with embedding dimension 11 " \ - "should fail, got {} instead" - self.fail(msg.format(embedded)) - except ValueError: - pass - data = np.arange(10, dtype="float32") - try: - embedded = nolds.delay_embedding(data, 4, lag=4) - msg = "embedding array of size 10 with embedding dimension 4 and " \ - "lag 4 should fail, got {} instead" - self.fail(msg.format(embedded)) - except ValueError: - pass + """Tests for internal helper functions that are not part of the public API.""" + + def assert_array_equals(self, expected, actual, print_arrays=False) -> None: + if print_arrays: + pass + assert np.all(actual == expected) + + def test_delay_embed_lag2(self) -> None: + data = np.arange(10, dtype="float32") + embedded = nolds.delay_embedding(data, 4, lag=2) + expected = np.array( + [ + [0, 2, 4, 6], + [1, 3, 5, 7], + [2, 4, 6, 8], + [3, 5, 7, 9], + ], + dtype="float32", + ) + self.assert_array_equals(expected, embedded) + + def test_delay_embed(self) -> None: + data = np.arange(6, dtype="float32") + embedded = nolds.delay_embedding(data, 4) + expected = np.array( + [ + [0, 1, 2, 3], + [1, 2, 3, 4], + [2, 3, 4, 5], + ], + dtype="float32", + ) + self.assert_array_equals(expected, embedded) + + def test_delay_embed_lag3(self) -> None: + data = np.arange(10, dtype="float32") + embedded = nolds.delay_embedding(data, 4, lag=3) + expected = np.array( + [ + [0, 3, 
6, 9], + ], + dtype="float32", + ) + self.assert_array_equals(expected, embedded) + + def test_delay_embed_empty(self) -> None: + data = np.arange(10, dtype="float32") + try: + embedded = nolds.delay_embedding(data, 11) + msg = ( + "embedding array of size 10 with embedding dimension 11 should fail, got {} instead" + ) + self.fail(msg.format(embedded)) + except ValueError: + pass + data = np.arange(10, dtype="float32") + try: + embedded = nolds.delay_embedding(data, 4, lag=4) + msg = ( + "embedding array of size 10 with embedding dimension 4 and " + "lag 4 should fail, got {} instead" + ) + self.fail(msg.format(embedded)) + except ValueError: + pass class TestNoldsUtility(unittest.TestCase): - """Tests for small utility functions that are part of the public API.""" - def test_binary_n(self) -> None: - x = nolds.binary_n(1000, min_n=50) - self.assertSequenceEqual(x, [500, 250, 125, 62]) + """Tests for small utility functions that are part of the public API.""" - def test_binary_n_empty(self) -> None: - x = nolds.binary_n(50, min_n=50) - self.assertSequenceEqual(x, []) + def test_binary_n(self) -> None: + x = nolds.binary_n(1000, min_n=50) + self.assertSequenceEqual(x, [500, 250, 125, 62]) - def test_logarithmic_n(self) -> None: - x = nolds.logarithmic_n(4, 11, 1.51) - self.assertSequenceEqual(x, [4, 6, 9]) + def test_binary_n_empty(self) -> None: + x = nolds.binary_n(50, min_n=50) + self.assertSequenceEqual(x, []) - def test_logarithmic_r(self) -> None: - x = nolds.logarithmic_r(4, 10, 1.51) - self.assertSequenceEqual(x, [4, 6.04, 9.1204]) + def test_logarithmic_n(self) -> None: + x = nolds.logarithmic_n(4, 11, 1.51) + self.assertSequenceEqual(x, [4, 6, 9]) + def test_logarithmic_r(self) -> None: + x = nolds.logarithmic_r(4, 10, 1.51) + self.assertSequenceEqual(x, [4, 6.04, 9.1204]) -class TestNoldsLyap(unittest.TestCase): - """Tests for lyap_e and lyap_r.""" - def test_lyap_logistic(self) -> None: - rvals = [2.5, 3.4, 3.7, 4.0] - sign = [-1, -1, 1, 1] - x0 = 0.1 - 
- def logistic(x, r): - return r * x * (1 - x) - - for r, s in zip(rvals, sign): - log = [] - x = x0 - for _ in range(100): - x = logistic(x, r) - log.append(x) - log = np.array(log, dtype="float32") - le = np.max(nolds.lyap_e(log, emb_dim=6, matrix_dim=2)) - lr = nolds.lyap_r(log, emb_dim=6, lag=2, min_tsep=10, trajectory_len=20) - assert s == int(np.sign(le)), f"r = {r}" - assert s == int(np.sign(lr)), f"r = {r}" - - def test_lyap_lorenz(self) -> None: - """Test hypothesis: Both lyap_r and lyap_e can reconstruct the largest Lyapunov exponent of the Lorenz system. - - The parameters for generating the Lorenz system were chosen to be as close as - possible to the experiments performed by Leonov and Kuznetsov (see [l_4]_) - and . - - For performance reasons the size of the input data was reduced and therefore the - assert conditions needed to be relaxed a bit. - - .. [l_4] G. A. Leonov and N. V. Kuznetsov, “On differences and - similarities in the analysis of Lorenz, Chen, and Lu systems,” - Applied Mathematics and Computation, vol. 256, pp. 334–343, 2015, - doi: 10.1016/j.amc.2014.12.132. 
- """ - data = datasets.lorenz_euler(3000, 10, 28, 8/3.0, start=[1,1,1], dt=0.01)[1000:] - lyap_r_args = {"min_tsep": 10, "emb_dim": 5, "tau": 0.01, "lag": 5, "trajectory_len": 28, "fit_offset": 8, "fit": "poly"} - lyap_rx = nolds.lyap_r(data[:, 0], **lyap_r_args) - lyap_ry = nolds.lyap_r(data[:, 1], **lyap_r_args) - lyap_rz = nolds.lyap_r(data[:, 2], **lyap_r_args) - lyap_e_args = {"min_tsep": 10, "emb_dim": 5, "matrix_dim": 5, "tau": 0.01, "min_nb": 8} - lyap_ex = nolds.lyap_e(data[:, 0], **lyap_e_args) - lyap_ey = nolds.lyap_e(data[:, 1], **lyap_e_args) - lyap_ez = nolds.lyap_e(data[:, 2], **lyap_e_args) - self.assertAlmostEqual(2.4, lyap_rx, delta=0.5) - self.assertAlmostEqual(2.4, lyap_ry, delta=0.5) - self.assertAlmostEqual(2.4, lyap_rz, delta=0.5) - assert lyap_ex[0] > 1.5 - assert lyap_ey[0] > 1.5 - assert lyap_ez[0] > 1.5 - - def test_lyap_fbm(self) -> None: - data = datasets.fbm(1000, H=0.3) - le = nolds.lyap_e(data, emb_dim=7, matrix_dim=3) - assert np.max(le) > 0 - - def test_lyap_r_limits(self) -> None: - """Tests if minimal input size is correctly calculated.""" - np.random.seed(0) - for i in range(10): - kwargs = { - "emb_dim": np.random.randint(1,10), - "lag": np.random.randint(1,6), - "min_tsep": np.random.randint(0,5), - "trajectory_len": np.random.randint(2,10), - } - min_len = nolds.lyap_r_len(**kwargs) - for i in reversed(range(max(1,min_len-5),min_len+5)): - data = np.random.random(i) - if i < min_len: - ## too few data points => execution should fail - try: - with warnings.catch_warnings(): - warnings.simplefilter("ignore", RuntimeWarning) - nolds.lyap_r(data, fit="poly", **kwargs) - msg = "{} data points should be required for kwargs {}, but " \ - "{} where enough" - self.fail(msg.format( - min_len, - kwargs, - i, - )) - except ValueError: - #print(e) - pass - else: - ## enough data points => execution should succeed - msg = "{} data points should be enough for kwargs {}, but " \ - " {} where too few" - try: - assert 
np.all(np.isfinite(nolds.lyap_r(data, fit="poly", **kwargs))), msg.format(min_len, kwargs, i) - except ValueError as e: - self.fail( - msg.format(min_len, kwargs, i) + ", original error: "+str(e), - ) - def test_lyap_e_limits(self) -> None: - """Tests if minimal input size is correctly calculated.""" - np.random.seed(1) - for i in range(10): - kwargs = { - "matrix_dim": np.random.randint(2,10), - "min_tsep": np.random.randint(0,10), - "min_nb": np.random.randint(2,15), - } - kwargs["emb_dim"] = np.random.randint(1,4) \ - * (kwargs["matrix_dim"] - 1) + 1 - min_len = nolds.lyap_e_len(**kwargs) - for i in reversed(range(max(1,min_len-5),min_len+5)): - data = np.random.random(i) - if i < min_len: - ## too few data points => execution should fail - try: - with warnings.catch_warnings(): - warnings.simplefilter("ignore", RuntimeWarning) - nolds.lyap_e(data, **kwargs) - msg = "{} data points should be required for kwargs {}, but " \ - "{} where enough" - self.fail(msg.format( - min_len, - kwargs, - i, - )) - except ValueError: - #print(e) - pass - else: - ## enough data points => execution should succeed - msg = "{} data points should be enough for kwargs {}, but " \ - " {} where too few" - try: - assert np.all(np.isfinite(nolds.lyap_e(data, **kwargs))), msg.format(min_len, kwargs, i) - except ValueError as e: - self.fail( - msg.format(min_len, kwargs, i) + ", original error: "+str(e), - ) +class TestNoldsLyap(unittest.TestCase): + """Tests for lyap_e and lyap_r.""" + + def test_lyap_logistic(self) -> None: + rvals = [2.5, 3.4, 3.7, 4.0] + sign = [-1, -1, 1, 1] + x0 = 0.1 + + def logistic(x, r): + return r * x * (1 - x) + + for r, s in zip(rvals, sign): + log = [] + x = x0 + for _ in range(100): + x = logistic(x, r) + log.append(x) + log = np.array(log, dtype="float32") + le = np.max(nolds.lyap_e(log, emb_dim=6, matrix_dim=2)) + lr = nolds.lyap_r(log, emb_dim=6, lag=2, min_tsep=10, trajectory_len=20) + assert s == int(np.sign(le)), f"r = {r}" + assert s == 
int(np.sign(lr)), f"r = {r}" + + def test_lyap_lorenz(self) -> None: + """Test hypothesis: Both lyap_r and lyap_e can reconstruct the largest Lyapunov exponent of the Lorenz system. + + The parameters for generating the Lorenz system were chosen to be as close as + possible to the experiments performed by Leonov and Kuznetsov (see [l_4]_) + and . + + For performance reasons the size of the input data was reduced and therefore the + assert conditions needed to be relaxed a bit. + + .. [l_4] G. A. Leonov and N. V. Kuznetsov, “On differences and + similarities in the analysis of Lorenz, Chen, and Lu systems,” + Applied Mathematics and Computation, vol. 256, pp. 334–343, 2015, + doi: 10.1016/j.amc.2014.12.132. + """ + data = datasets.lorenz_euler(3000, 10, 28, 8 / 3.0, start=[1, 1, 1], dt=0.01)[1000:] + lyap_r_args = { + "min_tsep": 10, + "emb_dim": 5, + "tau": 0.01, + "lag": 5, + "trajectory_len": 28, + "fit_offset": 8, + "fit": "poly", + } + lyap_rx = nolds.lyap_r(data[:, 0], **lyap_r_args) + lyap_ry = nolds.lyap_r(data[:, 1], **lyap_r_args) + lyap_rz = nolds.lyap_r(data[:, 2], **lyap_r_args) + lyap_e_args = {"min_tsep": 10, "emb_dim": 5, "matrix_dim": 5, "tau": 0.01, "min_nb": 8} + lyap_ex = nolds.lyap_e(data[:, 0], **lyap_e_args) + lyap_ey = nolds.lyap_e(data[:, 1], **lyap_e_args) + lyap_ez = nolds.lyap_e(data[:, 2], **lyap_e_args) + self.assertAlmostEqual(2.4, lyap_rx, delta=0.5) + self.assertAlmostEqual(2.4, lyap_ry, delta=0.5) + self.assertAlmostEqual(2.4, lyap_rz, delta=0.5) + assert lyap_ex[0] > 1.5 + assert lyap_ey[0] > 1.5 + assert lyap_ez[0] > 1.5 + + def test_lyap_fbm(self) -> None: + data = datasets.fbm(1000, H=0.3) + le = nolds.lyap_e(data, emb_dim=7, matrix_dim=3) + assert np.max(le) > 0 + + def test_lyap_r_limits(self) -> None: + """Tests if minimal input size is correctly calculated.""" + np.random.seed(0) + for i in range(10): + kwargs = { + "emb_dim": np.random.randint(1, 10), + "lag": np.random.randint(1, 6), + "min_tsep": np.random.randint(0, 5), 
+ "trajectory_len": np.random.randint(2, 10), + } + min_len = nolds.lyap_r_len(**kwargs) + for i in reversed(range(max(1, min_len - 5), min_len + 5)): + data = np.random.random(i) + if i < min_len: + ## too few data points => execution should fail + try: + with warnings.catch_warnings(): + warnings.simplefilter("ignore", RuntimeWarning) + nolds.lyap_r(data, fit="poly", **kwargs) + msg = "{} data points should be required for kwargs {}, but {} where enough" + self.fail( + msg.format( + min_len, + kwargs, + i, + ) + ) + except ValueError: + # print(e) + pass + else: + ## enough data points => execution should succeed + msg = "{} data points should be enough for kwargs {}, but {} where too few" + try: + assert np.all(np.isfinite(nolds.lyap_r(data, fit="poly", **kwargs))), ( + msg.format(min_len, kwargs, i) + ) + except ValueError as e: + self.fail( + msg.format(min_len, kwargs, i) + ", original error: " + str(e), + ) + + def test_lyap_e_limits(self) -> None: + """Tests if minimal input size is correctly calculated.""" + np.random.seed(1) + for i in range(10): + kwargs = { + "matrix_dim": np.random.randint(2, 10), + "min_tsep": np.random.randint(0, 10), + "min_nb": np.random.randint(2, 15), + } + kwargs["emb_dim"] = np.random.randint(1, 4) * (kwargs["matrix_dim"] - 1) + 1 + min_len = nolds.lyap_e_len(**kwargs) + for i in reversed(range(max(1, min_len - 5), min_len + 5)): + data = np.random.random(i) + if i < min_len: + ## too few data points => execution should fail + try: + with warnings.catch_warnings(): + warnings.simplefilter("ignore", RuntimeWarning) + nolds.lyap_e(data, **kwargs) + msg = "{} data points should be required for kwargs {}, but {} where enough" + self.fail( + msg.format( + min_len, + kwargs, + i, + ) + ) + except ValueError: + # print(e) + pass + else: + ## enough data points => execution should succeed + msg = "{} data points should be enough for kwargs {}, but {} where too few" + try: + assert np.all(np.isfinite(nolds.lyap_e(data, **kwargs))), 
msg.format( + min_len, kwargs, i + ) + except ValueError as e: + self.fail( + msg.format(min_len, kwargs, i) + ", original error: " + str(e), + ) class TestNoldsHurst(unittest.TestCase): - """Tests for hurst_rs.""" - def test_hurst_basic(self) -> None: - np.random.seed(2) - # strong negative correlation between successive elements - seq_neg = [] - x = np.random.random() - for _ in range(10000): - x = -x + np.random.random() - 0.5 - seq_neg.append(x) - h_neg = nolds.hurst_rs(seq_neg) - #print("h_neg = %.3f" % h_neg) - # expected h is around 0 - assert h_neg < 0.3 - - # no correlation, just random noise - x = np.random.randn(10000) - h_rand = nolds.hurst_rs(x) - #print("h_rand = %.3f" % h_rand) - # expected h is around 0.5 - self.assertAlmostEqual(h_rand, 0.5, delta=0.1) - - # cumulative sum has strong positive correlation between - # elements - walk = np.cumsum(x) - h_walk = nolds.hurst_rs(walk) - #print("h_walk = %.3f" % h_walk) - # expected h is around 1.0 - assert h_walk > 0.9 - - def test_hurst_pracma(self) -> None: - """Tests for hurst_rs using the same tests as in the R-package pracma.""" - np.random.seed(3) - # This test reproduces the results presented by Ian L. Kaplan on - # bearcave.com - h72 = nolds.hurst_rs( - datasets.brown72, fit="poly", corrected=False, unbiased=False, - nvals=2**np.arange(3,11)) - #print("h72 = %.3f" % h72) - self.assertAlmostEqual(h72, 0.72, delta=0.01) - - xgn = np.random.normal(size=10000) - hgn = nolds.hurst_rs(xgn, fit="poly") - #print("hgn = %.3f" % hgn) - self.assertAlmostEqual(hgn, 0.5, delta=0.1) - - xlm = np.fromiter(datasets.logistic_map(0.1,1024),dtype="float32") - hlm = nolds.hurst_rs(xlm, fit="poly", nvals=2**np.arange(3,11)) - #print("hlm = %.3f" % hlm) - self.assertAlmostEqual(hlm, 0.43, delta=0.05) - - def test_hurst_lorenz(self) -> None: - """Test hypothesis: We get correct values for estimating the hurst exponent of the Lorenz system. - - All parameter values are chosen to replicate the experiment by Suyal et al. 
(see [l_3]_) - as closely as possible. - - For performance reasons the size of the input data was reduced and therefore the - assert conditions needed to be relaxed a bit. - - .. [l_3] V. Suyal, A. Prasad, and H. P. Singh, “Nonlinear Time Series - Analysis of Sunspot Data,” Sol Phys, vol. 260, no. 2, pp. 441–449, - 2009, doi: 10.1007/s11207-009-9467-x. - """ - data = datasets.lorenz_euler(3000, 10, 28, 8/3.0, start=[1,1,1], dt=0.01)[1000:] - hurst_rs_args = {"fit": "poly", "nvals": nolds.logarithmic_n(10, 70, 1.1)} - hx = nolds.hurst_rs(data[:, 0], **hurst_rs_args) - hy = nolds.hurst_rs(data[:, 1], **hurst_rs_args) - hz = nolds.hurst_rs(data[:, 2], **hurst_rs_args) - self.assertAlmostEqual(0.9, hx, delta=0.05) - self.assertAlmostEqual(0.9, hy, delta=0.05) - self.assertAlmostEqual(0.9, hz, delta=0.05) + """Tests for hurst_rs.""" + + def test_hurst_basic(self) -> None: + np.random.seed(2) + # strong negative correlation between successive elements + seq_neg = [] + x = np.random.random() + for _ in range(10000): + x = -x + np.random.random() - 0.5 + seq_neg.append(x) + h_neg = nolds.hurst_rs(seq_neg) + # print("h_neg = %.3f" % h_neg) + # expected h is around 0 + assert h_neg < 0.3 + + # no correlation, just random noise + x = np.random.randn(10000) + h_rand = nolds.hurst_rs(x) + # print("h_rand = %.3f" % h_rand) + # expected h is around 0.5 + self.assertAlmostEqual(h_rand, 0.5, delta=0.1) + + # cumulative sum has strong positive correlation between + # elements + walk = np.cumsum(x) + h_walk = nolds.hurst_rs(walk) + # print("h_walk = %.3f" % h_walk) + # expected h is around 1.0 + assert h_walk > 0.9 + + def test_hurst_pracma(self) -> None: + """Tests for hurst_rs using the same tests as in the R-package pracma.""" + np.random.seed(3) + # This test reproduces the results presented by Ian L. 
Kaplan on + # bearcave.com + h72 = nolds.hurst_rs( + datasets.brown72, + fit="poly", + corrected=False, + unbiased=False, + nvals=2 ** np.arange(3, 11), + ) + # print("h72 = %.3f" % h72) + self.assertAlmostEqual(h72, 0.72, delta=0.01) + + xgn = np.random.normal(size=10000) + hgn = nolds.hurst_rs(xgn, fit="poly") + # print("hgn = %.3f" % hgn) + self.assertAlmostEqual(hgn, 0.5, delta=0.1) + + xlm = np.fromiter(datasets.logistic_map(0.1, 1024), dtype="float32") + hlm = nolds.hurst_rs(xlm, fit="poly", nvals=2 ** np.arange(3, 11)) + # print("hlm = %.3f" % hlm) + self.assertAlmostEqual(hlm, 0.43, delta=0.05) + + def test_hurst_lorenz(self) -> None: + """Test hypothesis: We get correct values for estimating the hurst exponent of the Lorenz system. + + All parameter values are chosen to replicate the experiment by Suyal et al. (see [l_3]_) + as closely as possible. + + For performance reasons the size of the input data was reduced and therefore the + assert conditions needed to be relaxed a bit. + + .. [l_3] V. Suyal, A. Prasad, and H. P. Singh, “Nonlinear Time Series + Analysis of Sunspot Data,” Sol Phys, vol. 260, no. 2, pp. 441–449, + 2009, doi: 10.1007/s11207-009-9467-x. 
+ """ + data = datasets.lorenz_euler(3000, 10, 28, 8 / 3.0, start=[1, 1, 1], dt=0.01)[1000:] + hurst_rs_args = {"fit": "poly", "nvals": nolds.logarithmic_n(10, 70, 1.1)} + hx = nolds.hurst_rs(data[:, 0], **hurst_rs_args) + hy = nolds.hurst_rs(data[:, 1], **hurst_rs_args) + hz = nolds.hurst_rs(data[:, 2], **hurst_rs_args) + self.assertAlmostEqual(0.9, hx, delta=0.05) + self.assertAlmostEqual(0.9, hy, delta=0.05) + self.assertAlmostEqual(0.9, hz, delta=0.05) -class TestNoldsDFA(unittest.TestCase): - """Tests for dfa.""" - def test_dfa_base(self) -> None: - np.random.seed(4) - # strong negative correlation between successive elements - seq_neg = [] - x = np.random.random() - for _ in range(10000): - x = -x + np.random.random() - 0.5 - seq_neg.append(x) - h_neg = nolds.dfa(seq_neg) - # expected h is around 0 - assert h_neg < 0.3 - - # no correlation, just random noise - x = np.random.randn(10000) - h_rand = nolds.dfa(x) - # expected h is around 0.5 - assert h_rand < 0.7 - assert h_rand > 0.3 - - # cumulative sum has strong positive correlation between - # elements - walk = np.cumsum(x) - h_walk = nolds.dfa(walk) - # expected h is around 1.0 - assert h_walk > 0.7 - - def test_dfa_fbm(self) -> None: - hs = [0.3, 0.5, 0.7] - for h in hs: - data = datasets.fbm(1000, H=h) - he = nolds.dfa(data) - self.assertAlmostEqual(he, h + 1, delta=0.15) - - def test_dfa_lorenz(self) -> None: - """Test hypothesis: We get correct values for estimating the Hurst parameter of the Lorenz system. - - All parameter values are chosen to replicate the experiment by Wallot et al. (see [l_5]_) - as closely as possible. - - For performance reasons the size of the input data was reduced and therefore the - assert conditions needed to be relaxed a bit. - - .. [l_5] S. Wallot, J. P. Irmer, M. Tschense, N. Kuznetsov, A. Højlund, - and M. 
Dietz, “A Multivariate Method for Dynamic System Analysis: - Multivariate Detrended Fluctuation Analysis Using Generalized Variance,” - Topics in Cognitive Science, p. tops.12688, Sep. 2023, - doi: 10.1111/tops.12688. - """ - data = datasets.lorenz_euler(120000, 10, 28, 8/3.0, start=[0.1,0.1,0.1], dt=0.002)[20000:] - nvals = nolds.logarithmic_n(200, len(data)/8, 2**0.2) - dfa_args = {"nvals": nvals, "order": 2, "overlap": False, "fit_exp": "poly"} - dx = nolds.dfa(data[:, 0], **dfa_args) - dy = nolds.dfa(data[:, 1], **dfa_args) - dz = nolds.dfa(data[:, 2], **dfa_args) - self.assertAlmostEqual(1.008, dx, delta=0.04) - self.assertAlmostEqual(0.926, dy, delta=0.032) - self.assertAlmostEqual(0.650, dz, delta=0.44) - - def test_dfa_agreement_with_physionet(self) -> None: - """Test hypothesis: Using the same parameters, the output of nolds is identical to the output of PhysioNet.""" - lorenz_x, physionet_points = datasets.load_lorenz_physionet() - nvals = [round(x) for x in 10 ** physionet_points[:,0]] - _, (_, nolds_rs, _) = nolds.dfa(lorenz_x, nvals=nvals, overlap=False, fit_exp="poly", debug_data=True) - nolds_rs_log10 = nolds_rs / np.log(10) - # assert that sum of squared errors is less than 1e-9 - assert sum((physionet_points[:, 1] - nolds_rs_log10) ** 2) < 1e-09 - - @unittest.skipUnless(SCIPY_AVAILABLE, "Tests using Lévy motion require scipy.") - def test_dfa_levy(self) -> None: - """Test hypothesis: We get correct values for estimating the Hurst parameter of Lévy motion. - - Reference: https://github.com/CSchoel/nolds/issues/17#issuecomment-1905472813. 
- """ - alpha = 1.5 - x = levy_stable.rvs(alpha=alpha, beta=0, size=10000) - h = nolds.dfa(x, fit_exp="poly") - self.assertAlmostEqual(0.5, h, delta=0.1) +class TestNoldsDFA(unittest.TestCase): + """Tests for dfa.""" + + def test_dfa_base(self) -> None: + np.random.seed(4) + # strong negative correlation between successive elements + seq_neg = [] + x = np.random.random() + for _ in range(10000): + x = -x + np.random.random() - 0.5 + seq_neg.append(x) + h_neg = nolds.dfa(seq_neg) + # expected h is around 0 + assert h_neg < 0.3 + + # no correlation, just random noise + x = np.random.randn(10000) + h_rand = nolds.dfa(x) + # expected h is around 0.5 + assert h_rand < 0.7 + assert h_rand > 0.3 + + # cumulative sum has strong positive correlation between + # elements + walk = np.cumsum(x) + h_walk = nolds.dfa(walk) + # expected h is around 1.0 + assert h_walk > 0.7 + + def test_dfa_fbm(self) -> None: + hs = [0.3, 0.5, 0.7] + for h in hs: + data = datasets.fbm(1000, H=h) + he = nolds.dfa(data) + self.assertAlmostEqual(he, h + 1, delta=0.15) + + def test_dfa_lorenz(self) -> None: + """Test hypothesis: We get correct values for estimating the Hurst parameter of the Lorenz system. + + All parameter values are chosen to replicate the experiment by Wallot et al. (see [l_5]_) + as closely as possible. + + For performance reasons the size of the input data was reduced and therefore the + assert conditions needed to be relaxed a bit. + + .. [l_5] S. Wallot, J. P. Irmer, M. Tschense, N. Kuznetsov, A. Højlund, + and M. Dietz, “A Multivariate Method for Dynamic System Analysis: + Multivariate Detrended Fluctuation Analysis Using Generalized Variance,” + Topics in Cognitive Science, p. tops.12688, Sep. 2023, + doi: 10.1111/tops.12688. 
+ """ + data = datasets.lorenz_euler(120000, 10, 28, 8 / 3.0, start=[0.1, 0.1, 0.1], dt=0.002)[ + 20000: + ] + nvals = nolds.logarithmic_n(200, len(data) / 8, 2**0.2) + dfa_args = {"nvals": nvals, "order": 2, "overlap": False, "fit_exp": "poly"} + dx = nolds.dfa(data[:, 0], **dfa_args) + dy = nolds.dfa(data[:, 1], **dfa_args) + dz = nolds.dfa(data[:, 2], **dfa_args) + self.assertAlmostEqual(1.008, dx, delta=0.04) + self.assertAlmostEqual(0.926, dy, delta=0.032) + self.assertAlmostEqual(0.650, dz, delta=0.44) + + def test_dfa_agreement_with_physionet(self) -> None: + """Test hypothesis: Using the same parameters, the output of nolds is identical to the output of PhysioNet.""" + lorenz_x, physionet_points = datasets.load_lorenz_physionet() + nvals = [round(x) for x in 10 ** physionet_points[:, 0]] + _, (_, nolds_rs, _) = nolds.dfa( + lorenz_x, nvals=nvals, overlap=False, fit_exp="poly", debug_data=True + ) + nolds_rs_log10 = nolds_rs / np.log(10) + # assert that sum of squared errors is less than 1e-9 + assert sum((physionet_points[:, 1] - nolds_rs_log10) ** 2) < 1e-09 + + @unittest.skipUnless(SCIPY_AVAILABLE, "Tests using Lévy motion require scipy.") + def test_dfa_levy(self) -> None: + """Test hypothesis: We get correct values for estimating the Hurst parameter of Lévy motion. + + Reference: https://github.com/CSchoel/nolds/issues/17#issuecomment-1905472813. + """ + alpha = 1.5 + x = levy_stable.rvs(alpha=alpha, beta=0, size=10000) + h = nolds.dfa(x, fit_exp="poly") + self.assertAlmostEqual(0.5, h, delta=0.1) class TestNoldsCorrDim(unittest.TestCase): - """Tests for corr_dim.""" - def test_corr_dim(self) -> None: - np.random.seed(5) - n = 1000 - data = np.arange(n) - cd = nolds.corr_dim(data, 4) - self.assertAlmostEqual(cd, 1, delta=0.05) - # TODO what is the prescribed correlation dimension for random data? 
- data = np.random.random(n) - cd = nolds.corr_dim(data, 4, fit="poly") - self.assertAlmostEqual(cd, 0.5, delta=0.15) - # TODO test example for cd > 1 - - def test_lorenz(self) -> None: - """Test hypothesis: We get correct values for estimating the correlation dimension of the Lorenz system. - - All parameter values are chosen to replicate the experiment by Grassberger and Procaccia (1983) - as closely as possible. - - For performance reasons the size of the input data was reduced and therefore the - assert conditions needed to be relaxed a bit. The settings of n, discard, - lag, emb_dim, and rvals were determined experimentally to find the smallest - dataset that yields the results reported. - - .. [l_1] P. Grassberger and I. Procaccia, “Measuring the strangeness - of strange attractors,” Physica D: Nonlinear Phenomena, vol. 9, - no. 1, pp. 189–208, 1983. - """ - discard = 5000 - n = 5000 - lag = 10 - emb_dim = 5 - data = datasets.lorenz_euler(n + discard, 10, 28, 8/3, start=(1,1,1), dt=0.012) - x = data[discard:,1] - rvals = nolds.logarithmic_r(1, np.e, 1.1) # determined experimentally - cd = nolds.corr_dim(x, emb_dim, fit="poly", rvals=rvals, lag=lag) - self.assertAlmostEqual(cd, 2.05, delta=0.2) - - def test_logistic(self) -> None: - # TODO replicate tests with logistic map from grassberger-procaccia - pass + """Tests for corr_dim.""" + + def test_corr_dim(self) -> None: + np.random.seed(5) + n = 1000 + data = np.arange(n) + cd = nolds.corr_dim(data, 4) + self.assertAlmostEqual(cd, 1, delta=0.05) + # TODO what is the prescribed correlation dimension for random data? + data = np.random.random(n) + cd = nolds.corr_dim(data, 4, fit="poly") + self.assertAlmostEqual(cd, 0.5, delta=0.15) + # TODO test example for cd > 1 + + def test_lorenz(self) -> None: + """Test hypothesis: We get correct values for estimating the correlation dimension of the Lorenz system. 
+ + All parameter values are chosen to replicate the experiment by Grassberger and Procaccia (1983) + as closely as possible. + + For performance reasons the size of the input data was reduced and therefore the + assert conditions needed to be relaxed a bit. The settings of n, discard, + lag, emb_dim, and rvals were determined experimentally to find the smallest + dataset that yields the results reported. + + .. [l_1] P. Grassberger and I. Procaccia, “Measuring the strangeness + of strange attractors,” Physica D: Nonlinear Phenomena, vol. 9, + no. 1, pp. 189–208, 1983. + """ + discard = 5000 + n = 5000 + lag = 10 + emb_dim = 5 + data = datasets.lorenz_euler(n + discard, 10, 28, 8 / 3, start=(1, 1, 1), dt=0.012) + x = data[discard:, 1] + rvals = nolds.logarithmic_r(1, np.e, 1.1) # determined experimentally + cd = nolds.corr_dim(x, emb_dim, fit="poly", rvals=rvals, lag=lag) + self.assertAlmostEqual(cd, 2.05, delta=0.2) + + def test_logistic(self) -> None: + # TODO replicate tests with logistic map from grassberger-procaccia + pass class TestNoldsSampEn(unittest.TestCase): - """Tests for sampen.""" - def test_sampen_base(self) -> None: - data = [0, 1, 5, 4, 1, 0, 1, 5, 3] - # matches for m=2: 01-01, 15-15 - # matches for m=3: 015-015 - se = nolds.sampen(data) - self.assertAlmostEqual(se, -np.log(1.0/2), delta=0.01) - data = [1, 2, 1, 2.4, 1, 4] - # matches for m=1: 1-1,1-1,2-2.4,1-1 - # matches for m=2: [1,2]-[1,2.4], [2,1]-[2.4,1] - se = nolds.sampen(data, emb_dim=1, tolerance=0.5) - self.assertAlmostEqual(se, -np.log(2.0/4), delta=0.01) - data = [0, 20, 1, 2, 3, 4, 40, 60, 1.4, 2.4, 3.4, 80, 100, 1.4, 2.4, 3.4, - 4, 120, 140, 180] - # maches for m=3: [1,2,3]-[1.4,2.4,3.4],[1,2,3]-[1.4,2.4,3.4], - # [2,3,4]-[2.4,3.4,4], [1.4,2.4,3.4]-[1.4,2.4,3.4] - # matches for m=4: [1,2,3,4]-[1.4,2.4,3.4,4] - se = nolds.sampen(data, emb_dim=3, tolerance=0.5) - self.assertAlmostEqual(se, -np.log(1.0/4), delta=0.01) - - def test_sampen_logistic(self) -> None: - # logistic map with r 
= 2.8 => static value - data = list(datasets.logistic_map(0.45, 1000, r=2.8)) - self.assertAlmostEqual(0, nolds.sampen(data), delta=0.001) - self.assertAlmostEqual(0, nolds.sampen(data[100:], emb_dim=5), delta=0.001) - # logistic map with r = 3.3 => oscillation between two values - data = list(datasets.logistic_map(0.45, 1000, r=3.3)) - self.assertAlmostEqual(0, nolds.sampen(data), delta=0.001) - self.assertAlmostEqual(0, nolds.sampen(data[100:], emb_dim=5), delta=0.001) - # logistic map with r = 3.5 => oscillation between four values - data = list(datasets.logistic_map(0.45, 1000, r=3.5)) - self.assertAlmostEqual(0, nolds.sampen(data), delta=0.001) - self.assertAlmostEqual(0, nolds.sampen(data[100:], emb_dim=5), delta=0.001) - # logistic map with r = 3.9 => chaotic behavior - data = list(datasets.logistic_map(0.45, 1000, r=3.9)) - self.assertAlmostEqual(0.5, nolds.sampen(data[100:]), delta=0.1) - self.assertAlmostEqual(0.5, nolds.sampen(data[100:], emb_dim=5), delta=0.1) - - def test_sampen_random(self) -> None: - np.random.seed(6) - # normally distributed random numbers - data = np.random.randn(10000) - self.assertAlmostEqual(2.2, nolds.sampen(data), delta=0.1) - self.assertAlmostEqual(2.2, nolds.sampen(data, emb_dim=2), delta=0.1) - # TODO add tests with uniformly distributed random numbers - - def test_sampen_sinus(self) -> None: - # TODO add test with sinus signal - pass - - - def test_sampen_lorenz(self) -> None: - """Test hypothesis: We get correct values for estimating the sample entropy of the Lorenz system. - - All parameter values are chosen to replicate the experiment by Kaffashi et al. (2008) - as closely as possible. - - For performance reasons the size of the input data was reduced and therefore the - assert conditions needed to be relaxed a bit. - - .. [l_2] F. Kaffashi, R. Foglyano, C. G. Wilson, and K. A. Loparo, - “The effect of time delay on Approximate & Sample Entropy - calculations,” Physica D: Nonlinear Phenomena, vol. 237, no. 23, - pp. 
3069–3074, 2008, doi: 10.1016/j.physd.2008.06.005. - """ - data = datasets.lorenz_euler(3000, 10, 28, 8/3.0, start=[1,1,1], dt=0.01)[1000:] - sampen_args = {"emb_dim": 2, "lag": 1} - sx = nolds.sampen(data[:, 0], **sampen_args) - sy = nolds.sampen(data[:, 1], **sampen_args) - sz = nolds.sampen(data[:, 2], **sampen_args) - self.assertAlmostEqual(0.15, sx, delta=0.05) - self.assertAlmostEqual(0.15, sy, delta=0.05) - self.assertAlmostEqual(0.25, sz, delta=0.05) + """Tests for sampen.""" + + def test_sampen_base(self) -> None: + data = [0, 1, 5, 4, 1, 0, 1, 5, 3] + # matches for m=2: 01-01, 15-15 + # matches for m=3: 015-015 + se = nolds.sampen(data) + self.assertAlmostEqual(se, -np.log(1.0 / 2), delta=0.01) + data = [1, 2, 1, 2.4, 1, 4] + # matches for m=1: 1-1,1-1,2-2.4,1-1 + # matches for m=2: [1,2]-[1,2.4], [2,1]-[2.4,1] + se = nolds.sampen(data, emb_dim=1, tolerance=0.5) + self.assertAlmostEqual(se, -np.log(2.0 / 4), delta=0.01) + data = [0, 20, 1, 2, 3, 4, 40, 60, 1.4, 2.4, 3.4, 80, 100, 1.4, 2.4, 3.4, 4, 120, 140, 180] + # maches for m=3: [1,2,3]-[1.4,2.4,3.4],[1,2,3]-[1.4,2.4,3.4], + # [2,3,4]-[2.4,3.4,4], [1.4,2.4,3.4]-[1.4,2.4,3.4] + # matches for m=4: [1,2,3,4]-[1.4,2.4,3.4,4] + se = nolds.sampen(data, emb_dim=3, tolerance=0.5) + self.assertAlmostEqual(se, -np.log(1.0 / 4), delta=0.01) + + def test_sampen_logistic(self) -> None: + # logistic map with r = 2.8 => static value + data = list(datasets.logistic_map(0.45, 1000, r=2.8)) + self.assertAlmostEqual(0, nolds.sampen(data), delta=0.001) + self.assertAlmostEqual(0, nolds.sampen(data[100:], emb_dim=5), delta=0.001) + # logistic map with r = 3.3 => oscillation between two values + data = list(datasets.logistic_map(0.45, 1000, r=3.3)) + self.assertAlmostEqual(0, nolds.sampen(data), delta=0.001) + self.assertAlmostEqual(0, nolds.sampen(data[100:], emb_dim=5), delta=0.001) + # logistic map with r = 3.5 => oscillation between four values + data = list(datasets.logistic_map(0.45, 1000, r=3.5)) + 
self.assertAlmostEqual(0, nolds.sampen(data), delta=0.001) + self.assertAlmostEqual(0, nolds.sampen(data[100:], emb_dim=5), delta=0.001) + # logistic map with r = 3.9 => chaotic behavior + data = list(datasets.logistic_map(0.45, 1000, r=3.9)) + self.assertAlmostEqual(0.5, nolds.sampen(data[100:]), delta=0.1) + self.assertAlmostEqual(0.5, nolds.sampen(data[100:], emb_dim=5), delta=0.1) + + def test_sampen_random(self) -> None: + np.random.seed(6) + # normally distributed random numbers + data = np.random.randn(10000) + self.assertAlmostEqual(2.2, nolds.sampen(data), delta=0.1) + self.assertAlmostEqual(2.2, nolds.sampen(data, emb_dim=2), delta=0.1) + # TODO add tests with uniformly distributed random numbers + + def test_sampen_sinus(self) -> None: + # TODO add test with sinus signal + pass + + def test_sampen_lorenz(self) -> None: + """Test hypothesis: We get correct values for estimating the sample entropy of the Lorenz system. + + All parameter values are chosen to replicate the experiment by Kaffashi et al. (2008) + as closely as possible. + + For performance reasons the size of the input data was reduced and therefore the + assert conditions needed to be relaxed a bit. + + .. [l_2] F. Kaffashi, R. Foglyano, C. G. Wilson, and K. A. Loparo, + “The effect of time delay on Approximate & Sample Entropy + calculations,” Physica D: Nonlinear Phenomena, vol. 237, no. 23, + pp. 3069–3074, 2008, doi: 10.1016/j.physd.2008.06.005. + """ + data = datasets.lorenz_euler(3000, 10, 28, 8 / 3.0, start=[1, 1, 1], dt=0.01)[1000:] + sampen_args = {"emb_dim": 2, "lag": 1} + sx = nolds.sampen(data[:, 0], **sampen_args) + sy = nolds.sampen(data[:, 1], **sampen_args) + sz = nolds.sampen(data[:, 2], **sampen_args) + self.assertAlmostEqual(0.15, sx, delta=0.05) + self.assertAlmostEqual(0.15, sy, delta=0.05) + self.assertAlmostEqual(0.25, sz, delta=0.05) class RegressionTests(unittest.TestCase): - """Regression tests for main algorithms. 
- - These tests are here to safeguard against accidental algorithmic changes such - as updates to core dependencies such as numpy or the Python standard library. - """ - - def test_sampen(self) -> None: - """Test hypothesis: The exact output of sampen() on random data hasn't changed since the last version.""" - data = datasets.load_qrandom()[:1000] - se = nolds.sampen(data, emb_dim=2, tolerance=None, lag=1, dist=nolds.rowwise_chebyshev, closed=False) - self.assertAlmostEqual(2.1876999522832743, se, places=14) - - def test_corr_dim(self) -> None: - """Test hypothesis: The exact output of corr_dim() with `fit=poly` on random data hasn't changed since the last version.""" - data = datasets.load_qrandom()[:1000] - cd = nolds.corr_dim(data, emb_dim=5, lag=1, rvals=None, dist=nolds.rowwise_euclidean, fit="poly") - self.assertAlmostEqual(1.303252839255068, cd, places=14) - - @unittest.skipUnless(SCIPY_AVAILABLE, "Tests with RANSAC require scipy.") - def test_corr_dim_RANSAC(self) -> None: - """Test hypothesis: The exact output of corr_dim() with `fit=RANSAC` on random data hasn't changed since the last version.""" - data = datasets.load_qrandom()[:1000] - sd = np.std(data, ddof=1) - # fix seed - np.random.seed(42) - # usa a too wide range for rvals to give RANSAC something to do ;) - rvals = nolds.logarithmic_r(0.01 * sd, 2 * sd, 1.03) - cd = nolds.corr_dim(data, emb_dim=5, lag=1, rvals=rvals, dist=nolds.rowwise_euclidean, fit="RANSAC") - self.assertAlmostEqual(0.44745494643404665, cd, places=14) - - def test_lyap_e(self) -> None: - """Test hypothesis: The exact output of lyap_e() on random data hasn't changed since the last version.""" - data = datasets.load_qrandom()[:1000] - le = nolds.lyap_e(data, emb_dim=10, matrix_dim=4, min_nb=10, min_tsep=1, tau=1) - expected = np.array([ 0.03779942603329712, -0.014314012551504982, -0.08436867977030214, -0.22316730257003717]) - for i in range(le.shape[0]): - self.assertAlmostEqual(expected[i], le[i], places=14, msg=f"{i+1}th 
Lyapunov exponent doesn't match") - - def test_lyap_r(self) -> None: - """Test hypothesis: The exact output of lyap_r() with `fit=poly` on random data hasn't changed since the last version.""" - data = datasets.load_qrandom()[:1000] - le = nolds.lyap_r(data, emb_dim=10, lag=1, min_tsep=1, tau=1, min_neighbors=10, trajectory_len=10, fit="poly") - expected = 0.094715945307378 - self.assertAlmostEqual(expected, le, places=14) - - @unittest.skipUnless(SCIPY_AVAILABLE, "Tests with RANSAC require scipy.") - def test_lyap_r_RANSAC(self) -> None: - """Test hypothesis: The exact output of lyap_r() with `fit=RANSAC` on random data hasn't changed since the last version.""" - data = datasets.load_qrandom()[:1000] - np.random.seed(42) - # set lag to 2 for weird duplicate lines - # set trajectory_len to 100 to get many datapoints for RANSAC to choose from - le = nolds.lyap_r(data, emb_dim=10, lag=2, min_tsep=1, tau=1, min_neighbors=10, trajectory_len=100, fit="RANSAC") - expected = 0.0003401212353253564 - self.assertAlmostEqual(expected, le, places=14) - - def test_hurst_rs(self) -> None: - """Test hypothesis: The exact output of hurst_rs() with `fit=poly` on random data hasn't changed since the last version.""" - data = datasets.load_qrandom()[:1000] - rs = nolds.hurst_rs(data, nvals=None, fit="poly", corrected=True, unbiased=True) - expected = 0.5123887964986258 - self.assertAlmostEqual(expected, rs, places=14) - - @unittest.skipUnless(SCIPY_AVAILABLE, "Tests with RANSAC require scipy.") - def test_hurst_rs_RANSAC(self) -> None: - """Test hypothesis: The exact output of hurst_rs() with `fit=RANSAC` on random data hasn't changed since the last version.""" - data = datasets.load_qrandom()[:1000] - np.random.seed(42) - # increase nsteps in nvals to have more data points for RANSAC to choose from - nvals = nolds.logmid_n(data.shape[0], ratio=1/4.0, nsteps=100) - rs = nolds.hurst_rs(data, nvals=nvals, fit="RANSAC", corrected=True, unbiased=True) - expected = 0.4805431939943321 - 
self.assertAlmostEqual(expected, rs, places=14) - - def test_dfa(self) -> None: - """Test hypothesis: The exact output of dfa() with `fit_exp=poly` on random data hasn't changed since the last version.""" - data = datasets.load_qrandom()[:1000] - h = nolds.dfa(data, nvals=None, overlap=True, order=1, fit_trend="poly", fit_exp="poly") - expected = 0.5450874638765073 - self.assertAlmostEqual(expected, h, places=14) - - @unittest.skipUnless(SCIPY_AVAILABLE, "Tests with RANSAC require scipy.") - def test_dfa_RANSAC(self) -> None: - """Test hypothesis: The exact output of dfa() with `fit_exp=RANSAC` on random data hasn't changed since the last version.""" - # adds trend to data to introduce a less clear line for fitting - data = datasets.load_qrandom()[:1000] + np.arange(1000) * 100 - np.random.seed(42) - # adds more steps and higher values to nvals to introduce some scattering for RANSAC to have an effect on - nvals = nolds.logarithmic_n(10, 0.9 * data.shape[0], 1.1) - h = nolds.dfa(data, nvals=nvals, overlap=True, order=1, fit_trend="poly", fit_exp="RANSAC") - expected = 1.1372303125405405 - self.assertAlmostEqual(expected, h, places=14) - - def test_mfhurst_b(self) -> None: - """Test hypothesis: The exact output of mfhurst_b() with `fit=poly` on random data hasn't changed since the last version.""" - data = datasets.load_qrandom()[:1000] - h = nolds.mfhurst_b(data, qvals=[1], dists=None, fit="poly") - expected = [-0.00559398934417339] - self.assertAlmostEqual(expected[0], h[0], places=14) - - @unittest.skipUnless(SCIPY_AVAILABLE, "Tests with RANSAC require scipy.") - def test_mfhurst_b_RANSAC(self) -> None: - """Test hypothesis: The exact output of mfhurst_b() with `fit=RANSAC` on random data hasn't changed since the last version.""" - data = datasets.load_qrandom()[:1000] - np.random.seed(42) - h = nolds.mfhurst_b(data, qvals=[1], dists=None, fit="RANSAC") - expected = [-0.009056463064211057] - self.assertAlmostEqual(expected[0], h[0], places=14) - - def 
test_mfhurst_dm(self) -> None: - """Test hypothesis: The exact output of mfhurst_dm() with `fit=poly` on random data hasn't changed since the last version.""" - data = datasets.load_qrandom()[:1000] - h, _ = nolds.mfhurst_dm(data, qvals=[1], max_dists=range(5, 20), detrend=True, fit="poly") - expected = [0.008762803881203145] - self.assertAlmostEqual(expected[0], h[0], places=14) - - @unittest.skipUnless(SCIPY_AVAILABLE, "Tests with RANSAC require scipy.") - def test_mfhurst_dm_RANSAC(self) -> None: - """Test hypothesis: The exact output of mfhurst_dm() with `fit=RANSAC` on random data hasn't changed since the last version.""" - data = datasets.load_qrandom()[:1000] - np.random.seed(42) - h, _ = nolds.mfhurst_dm(data, qvals=[1], max_dists=range(5, 20), detrend=True, fit="RANSAC") - expected = [0.005324834328837356] - self.assertAlmostEqual(expected[0], h[0], places=14) + """Regression tests for main algorithms. + + These tests are here to safeguard against accidental algorithmic changes such + as updates to core dependencies such as numpy or the Python standard library. 
+ """ + + def test_sampen(self) -> None: + """Test hypothesis: The exact output of sampen() on random data hasn't changed since the last version.""" + data = datasets.load_qrandom()[:1000] + se = nolds.sampen( + data, emb_dim=2, tolerance=None, lag=1, dist=nolds.rowwise_chebyshev, closed=False + ) + self.assertAlmostEqual(2.1876999522832743, se, places=14) + + def test_corr_dim(self) -> None: + """Test hypothesis: The exact output of corr_dim() with `fit=poly` on random data hasn't changed since the last version.""" + data = datasets.load_qrandom()[:1000] + cd = nolds.corr_dim( + data, emb_dim=5, lag=1, rvals=None, dist=nolds.rowwise_euclidean, fit="poly" + ) + self.assertAlmostEqual(1.303252839255068, cd, places=14) + + @unittest.skipUnless(SCIPY_AVAILABLE, "Tests with RANSAC require scipy.") + def test_corr_dim_RANSAC(self) -> None: + """Test hypothesis: The exact output of corr_dim() with `fit=RANSAC` on random data hasn't changed since the last version.""" + data = datasets.load_qrandom()[:1000] + sd = np.std(data, ddof=1) + # fix seed + np.random.seed(42) + # usa a too wide range for rvals to give RANSAC something to do ;) + rvals = nolds.logarithmic_r(0.01 * sd, 2 * sd, 1.03) + cd = nolds.corr_dim( + data, emb_dim=5, lag=1, rvals=rvals, dist=nolds.rowwise_euclidean, fit="RANSAC" + ) + self.assertAlmostEqual(0.44745494643404665, cd, places=14) + + def test_lyap_e(self) -> None: + """Test hypothesis: The exact output of lyap_e() on random data hasn't changed since the last version.""" + data = datasets.load_qrandom()[:1000] + le = nolds.lyap_e(data, emb_dim=10, matrix_dim=4, min_nb=10, min_tsep=1, tau=1) + expected = np.array( + [0.03779942603329712, -0.014314012551504982, -0.08436867977030214, -0.22316730257003717] + ) + for i in range(le.shape[0]): + self.assertAlmostEqual( + expected[i], le[i], places=14, msg=f"{i + 1}th Lyapunov exponent doesn't match" + ) + + def test_lyap_r(self) -> None: + """Test hypothesis: The exact output of lyap_r() with `fit=poly` 
on random data hasn't changed since the last version.""" + data = datasets.load_qrandom()[:1000] + le = nolds.lyap_r( + data, + emb_dim=10, + lag=1, + min_tsep=1, + tau=1, + min_neighbors=10, + trajectory_len=10, + fit="poly", + ) + expected = 0.094715945307378 + self.assertAlmostEqual(expected, le, places=14) + + @unittest.skipUnless(SCIPY_AVAILABLE, "Tests with RANSAC require scipy.") + def test_lyap_r_RANSAC(self) -> None: + """Test hypothesis: The exact output of lyap_r() with `fit=RANSAC` on random data hasn't changed since the last version.""" + data = datasets.load_qrandom()[:1000] + np.random.seed(42) + # set lag to 2 for weird duplicate lines + # set trajectory_len to 100 to get many datapoints for RANSAC to choose from + le = nolds.lyap_r( + data, + emb_dim=10, + lag=2, + min_tsep=1, + tau=1, + min_neighbors=10, + trajectory_len=100, + fit="RANSAC", + ) + expected = 0.0003401212353253564 + self.assertAlmostEqual(expected, le, places=14) + + def test_hurst_rs(self) -> None: + """Test hypothesis: The exact output of hurst_rs() with `fit=poly` on random data hasn't changed since the last version.""" + data = datasets.load_qrandom()[:1000] + rs = nolds.hurst_rs(data, nvals=None, fit="poly", corrected=True, unbiased=True) + expected = 0.5123887964986258 + self.assertAlmostEqual(expected, rs, places=14) + + @unittest.skipUnless(SCIPY_AVAILABLE, "Tests with RANSAC require scipy.") + def test_hurst_rs_RANSAC(self) -> None: + """Test hypothesis: The exact output of hurst_rs() with `fit=RANSAC` on random data hasn't changed since the last version.""" + data = datasets.load_qrandom()[:1000] + np.random.seed(42) + # increase nsteps in nvals to have more data points for RANSAC to choose from + nvals = nolds.logmid_n(data.shape[0], ratio=1 / 4.0, nsteps=100) + rs = nolds.hurst_rs(data, nvals=nvals, fit="RANSAC", corrected=True, unbiased=True) + expected = 0.4805431939943321 + self.assertAlmostEqual(expected, rs, places=14) + + def test_dfa(self) -> None: + """Test 
hypothesis: The exact output of dfa() with `fit_exp=poly` on random data hasn't changed since the last version.""" + data = datasets.load_qrandom()[:1000] + h = nolds.dfa(data, nvals=None, overlap=True, order=1, fit_trend="poly", fit_exp="poly") + expected = 0.5450874638765073 + self.assertAlmostEqual(expected, h, places=14) + + @unittest.skipUnless(SCIPY_AVAILABLE, "Tests with RANSAC require scipy.") + def test_dfa_RANSAC(self) -> None: + """Test hypothesis: The exact output of dfa() with `fit_exp=RANSAC` on random data hasn't changed since the last version.""" + # adds trend to data to introduce a less clear line for fitting + data = datasets.load_qrandom()[:1000] + np.arange(1000) * 100 + np.random.seed(42) + # adds more steps and higher values to nvals to introduce some scattering for RANSAC to have an effect on + nvals = nolds.logarithmic_n(10, 0.9 * data.shape[0], 1.1) + h = nolds.dfa(data, nvals=nvals, overlap=True, order=1, fit_trend="poly", fit_exp="RANSAC") + expected = 1.1372303125405405 + self.assertAlmostEqual(expected, h, places=14) + + def test_mfhurst_b(self) -> None: + """Test hypothesis: The exact output of mfhurst_b() with `fit=poly` on random data hasn't changed since the last version.""" + data = datasets.load_qrandom()[:1000] + h = nolds.mfhurst_b(data, qvals=[1], dists=None, fit="poly") + expected = [-0.00559398934417339] + self.assertAlmostEqual(expected[0], h[0], places=14) + + @unittest.skipUnless(SCIPY_AVAILABLE, "Tests with RANSAC require scipy.") + def test_mfhurst_b_RANSAC(self) -> None: + """Test hypothesis: The exact output of mfhurst_b() with `fit=RANSAC` on random data hasn't changed since the last version.""" + data = datasets.load_qrandom()[:1000] + np.random.seed(42) + h = nolds.mfhurst_b(data, qvals=[1], dists=None, fit="RANSAC") + expected = [-0.009056463064211057] + self.assertAlmostEqual(expected[0], h[0], places=14) + + def test_mfhurst_dm(self) -> None: + """Test hypothesis: The exact output of mfhurst_dm() with `fit=poly` 
on random data hasn't changed since the last version.""" + data = datasets.load_qrandom()[:1000] + h, _ = nolds.mfhurst_dm(data, qvals=[1], max_dists=range(5, 20), detrend=True, fit="poly") + expected = [0.008762803881203145] + self.assertAlmostEqual(expected[0], h[0], places=14) + + @unittest.skipUnless(SCIPY_AVAILABLE, "Tests with RANSAC require scipy.") + def test_mfhurst_dm_RANSAC(self) -> None: + """Test hypothesis: The exact output of mfhurst_dm() with `fit=RANSAC` on random data hasn't changed since the last version.""" + data = datasets.load_qrandom()[:1000] + np.random.seed(42) + h, _ = nolds.mfhurst_dm(data, qvals=[1], max_dists=range(5, 20), detrend=True, fit="RANSAC") + expected = [0.005324834328837356] + self.assertAlmostEqual(expected[0], h[0], places=14) class PreviousDefectTests(unittest.TestCase): - """Tests that ensure that a previous bug doesn't come back at some point.""" + """Tests that ensure that a previous bug doesn't come back at some point.""" - def test_lyap_r_complex_min_tsep(self) -> None: - """Test hypothesis: The `min_tsep` parameter can be calculated without creating complex numbers. + def test_lyap_r_complex_min_tsep(self) -> None: + """Test hypothesis: The `min_tsep` parameter can be calculated without creating complex numbers. + + Previously, this would lead to an exception in the code. See + https://github.com/CSchoel/nolds/issues/53 for reference. + """ + data = np.cos(np.arange(100) * 0.01) + # previously this would fail with the following exception: + # TypeError: ufunc 'ceil' not supported for the input types, and the + # inputs could not be safely coerced to any supported types according to + # the casting rule ''safe'' + nolds.lyap_r(data) - Previously, this would lead to an exception in the code. See - https://github.com/CSchoel/nolds/issues/53 for reference. 
- """ - data = np.cos(np.arange(100)*0.01) - # previously this would fail with the following exception: - # TypeError: ufunc 'ceil' not supported for the input types, and the - # inputs could not be safely coerced to any supported types according to - # the casting rule ''safe'' - nolds.lyap_r(data) if __name__ == "__main__": - unittest.main() + unittest.main() diff --git a/pyproject.toml b/pyproject.toml index 66fd0b1..d342702 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -83,6 +83,7 @@ select = ["ALL"] ignore = [ "TD002", # adding an author to TODOs wastes space and is redundant because of git history "TD003", # once there is an issue, there is no need to keep the TODO => don't require links + "COM812", # missing trailing commas are already fixed by the formatter ] [tool.ruff.lint.pydocstyle] From 2e2ff699e1db55f78af8748235a503c1743565b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christopher=20Sch=C3=B6lzel?= Date: Wed, 11 Jun 2025 22:08:25 +0200 Subject: [PATCH 03/36] fixes remaining linter errors --- nolds/__init__.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/nolds/__init__.py b/nolds/__init__.py index 25faddb..4ebe796 100644 --- a/nolds/__init__.py +++ b/nolds/__init__.py @@ -1,3 +1,5 @@ +"""Main module for nolds, containing all important user-facing API elements.""" + from .datasets import ( barabasi1991_fractal, brown72, @@ -29,3 +31,33 @@ rowwise_euclidean, sampen, ) + +__all__ = [ + "barabasi1991_fractal", + "binary_n", + "brown72", + "corr_dim", + "dfa", + "expected_h", + "expected_rs", + "fbm", + "fgn", + "hurst_rs", + "load_financial", + "load_qrandom", + "logarithmic_n", + "logarithmic_r", + "logistic_map", + "logmid_n", + "lyap_e", + "lyap_e_len", + "lyap_r", + "lyap_r_len", + "mfhurst_b", + "mfhurst_dm", + "qrandom", + "rowwise_chebyshev", + "rowwise_euclidean", + "sampen", + "tent_map", +] From ffb6e572e5980583143e44cd06bb7e119dc9fcfc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christopher=20Sch=C3=B6lzel?= 
Date: Wed, 11 Jun 2025 22:48:47 +0200 Subject: [PATCH 04/36] fixes linting errors (wip) --- nolds/datasets.py | 136 ++++++++++++++++++++++++++++++---------------- 1 file changed, 88 insertions(+), 48 deletions(-) diff --git a/nolds/datasets.py b/nolds/datasets.py index 7e5a6af..fd928f3 100644 --- a/nolds/datasets.py +++ b/nolds/datasets.py @@ -1,10 +1,21 @@ +"""Contains functions to load example datasets used in nolds.""" + +from __future__ import annotations + import datetime import numpy as np import pkg_resources -def lorenz_euler(length, sigma, rho, beta, dt=0.01, start=None): +def lorenz_euler( + length: int, + sigma: float, + rho: float, + beta: float, + dt: float = 0.01, + start: list[float] | None = None, +) -> np.ndarray[tuple[int, int], np.dtype[np.float32]]: """Simulates the Lorenz system using a simple Euler method. The Lorenz system is a three dimensional dynamical system given @@ -13,11 +24,24 @@ def lorenz_euler(length, sigma, rho, beta, dt=0.01, start=None): dx/dt = sigma * (y - x) dy/dt = rho * x - y - x * z dz/dt = x * y - beta * z + + Args: + length: Number of data points to generate. + sigma: Sigma parameter of the Lorenz system. + rho: Rho parameter of the Lorenz system. + beta: Beta parameter of the Lorenz system. + dt: Time delta between two data points. + start: Optional starting point for the trajectory. + + Returns: + 2d-array of (x, y, z) data points in the simulated Lorenz system.
""" if start is None: start = [1, 1, 1] - def lorenz(state, sigma, rho, beta): + def lorenz( + state: np.ndarray[tuple[int], np.dtype[np.float32]], sigma: float, rho: float, beta: float + ) -> np.ndarray[tuple[int], np.dtype[np.float32]]: x, y, z = state # NOTE: Numpy 1.x stores intermediate results as float64 # => to achieve consistency between numpy versions, we have to use @@ -29,90 +53,106 @@ def lorenz(state, sigma, rho, beta): np.float32(rho) * x - y - x * z, x * y - np.float32(beta) * z, ], - dtype="float32", + dtype=np.float32, ) - trajectory = np.zeros((length, 3), dtype="float32") + trajectory = np.zeros((length, 3), dtype=np.float32) trajectory[0] = start for i in range(1, length): - # t = i * dt trajectory[i] = trajectory[i - 1] + lorenz(trajectory[i - 1], sigma, rho, beta) * dt return trajectory -def lorenz_lyap(sigma, rho, beta): - """Calculates the exact Lyapunov dimension of the Lorenz system according to - Leonov 2015 [ll_1]_. +def lorenz_lyap(sigma: float, rho: float, beta: float) -> float: + """Calculate the exact Lyapunov dimension of the Lorenz system. + + This uses the definition according to Leonov 2015 [ll_1]_. + + Args: + sigma: Sigma parameter of the Lorenz system. + rho: Rho parameter of the Lorenz system. + beta: Beta parameter of the Lorenz system. + + Returns: + Prescribed Lyapunov dimension for the Lorenz system according to Leonov 2015. References: - .. [ll_1] G. A. Leonov and N. V. Kuznetsov, “On differences and similarities in the - analysis of Lorenz, Chen, and Lu systems,” Applied Mathematics and Computation, - vol. 256, pp. 334–343, Apr. 2015, doi: 10.1016/j.amc.2014.12.132. + .. [ll_1] G. A. Leonov and N. V. Kuznetsov, “On differences and similarities in the + analysis of Lorenz, Chen, and Lu systems,” Applied Mathematics and Computation, + vol. 256, pp. 334–343, Apr. 2015, doi: 10.1016/j.amc.2014.12.132.
""" return 3 - 2 * (sigma + beta + 1) / (sigma + 1 + np.sqrt((sigma - 1) ** 2 + 4 * sigma * rho)) -def fbm(n, H=0.75): +def fbm( + n: int, + H: float = 0.75, # noqa: N803 + random_seed: int | None = None, +) -> np.ndarray[tuple[int], np.dtype[np.float64]]: """Generates fractional brownian motions of desired length. Author: - Christian Thomae + Christian Thomae References: - .. [fbm_1] https://en.wikipedia.org/wiki/Fractional_Brownian_motion#Method_1_of_simulation + .. [fbm_1] https://en.wikipedia.org/wiki/Fractional_Brownian_motion#Method_1_of_simulation Args: - n (int): - length of sequence to generate - Kwargs: - H (float): - hurst parameter + n: Length of sequence to generate. + H: Hurst parameter. + random_seed: Seed used for random number generation. Returns: - array of float: - simulated fractional brownian motion + array of float: + simulated fractional brownian motion """ - # TODO more detailed description of fbm - assert H > 0 - assert H < 1 - - def R(t, s): - twoH = 2 * H + if H < 0 or H > 1: + msg = f"H must be between 0 and 1, got {H} instead." + raise ValueError(msg) + + def R( # noqa: N802 + t: np.ndarray[tuple[int], np.dtype[np.float64]], + s: np.ndarray[tuple[int], np.dtype[np.float64]], + ) -> np.ndarray[tuple[int], np.dtype[np.float64]]: + twoH = 2 * H # noqa: N806 return 0.5 * (s**twoH + t**twoH - np.abs(t - s) ** twoH) # form the matrix tau gamma = R(*np.mgrid[0:n, 0:n]) # apply R to every element in matrix - w, P = np.linalg.eigh(gamma) - L = np.diag(w) + w, P = np.linalg.eigh(gamma) # noqa: N806 + L = np.diag(w) # noqa: N806 sigma = np.dot(np.dot(P, np.sqrt(L)), np.linalg.inv(P)) - v = np.random.randn(n) + gen = np.random.default_rng(seed=random_seed) + v = gen.standard_normal(n) return np.dot(sigma, v) -def fgn(n, H=0.75): +def fgn( + n: int, + H: float = 0.75, # noqa: N803 + random_seed: int | None = None, +) -> np.ndarray[tuple[int], np.dtype[np.float64]]: """Generates fractional gaussian noise of desired length. References: .. 
[fgn_1] https://en.wikipedia.org/wiki/Fractional_Brownian_motion Args: - n (int): - length of sequence to generate - - Kwargs: - H (float): - hurst parameter + n: Length of sequence to generate. + H: Hurst parameter. + random_seed: Seed used for random number generation. Returns: array of float: simulated fractional gaussian noise """ - return np.diff(fbm(n + 1, H=H)) + return np.diff(fbm(n + 1, H=H, random_seed=random_seed)) + +def qrandom(n: int) -> np.ndarray[tuple[int], np.dtype[np.uint16]]: + """Creates an array of n true random numbers. -def qrandom(n): - """Creates an array of n true random numbers obtained from the quantum random - number generator at qrng.anu.edu.au. + The data is obtained from the quantum random number generator at qrng.anu.edu.au. This function requires the package quantumrandom and an internet connection. @@ -249,7 +289,7 @@ def tent_map(x, steps, mu=2): def logistic_map(x, steps, r=4): r"""Generates a time series of the logistic map. - Characteristics and Background: + Characteristics and Background: The logistic map is among the simplest examples for a time series that can exhibit chaotic behavior depending on the parameter r. For r between 2 and 3, the series quickly becomes static. At r=3 the first bifurcation point is @@ -257,7 +297,7 @@ def logistic_map(x, steps, r=4): it shows chaotic behavior with a few islands of stability until perfect chaos is achieved at r = 4. - Calculating the Lyapunov exponent: + Calculating the Lyapunov exponent: To calculate the "true" Lyapunov exponent of the logistic map, we first have to make a few observations for maps in general that are repeated applications of a function to a starting value. @@ -314,24 +354,24 @@ def logistic_map(x, steps, r=4): - References: + References: .. [lm_1] https://en.wikipedia.org/wiki/Tent_map .. 
[lm_2] https://blog.abhranil.net/2015/05/15/lyapunov-exponent-of-the-logistic-map-mathematica-code/ - Args: + Args: x (float): starting point steps (int): number of steps for which the generator should run - Kwargs: + Kwargs: r (int): parameter r that controls the behavior of the map - Returns: + Returns: generator object: the generator that creates the time series - """ + """ for _ in range(steps): x = r * x * (1 - x) yield x From 2fdd1acee5aedb6dca06942fa93f3820ed4cf3b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christopher=20Sch=C3=B6lzel?= Date: Sun, 15 Jun 2025 22:09:23 +0200 Subject: [PATCH 05/36] swaps deprecated pkg_resources for importlib.resources --- nolds/datasets.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/nolds/datasets.py b/nolds/datasets.py index fd928f3..e15be2c 100644 --- a/nolds/datasets.py +++ b/nolds/datasets.py @@ -3,9 +3,9 @@ from __future__ import annotations import datetime +import importlib.resources import numpy as np -import pkg_resources def lorenz_euler( @@ -185,7 +185,7 @@ def load_qrandom(): the dataset """ fname = "datasets/qrandom.npy" - with pkg_resources.resource_stream(__name__, fname) as f: + with importlib.resources.files("nolds.datasets").joinpath(fname).open("rb") as f: return np.load(f) @@ -199,7 +199,7 @@ def load_brown72(): the dataset """ fname = "datasets/brown72.npy" - with pkg_resources.resource_stream(__name__, fname) as f: + with importlib.resources.files("nolds.datasets").joinpath(fname).open("rb") as f: return np.load(f) @@ -224,10 +224,10 @@ def load_lorenz_physionet(): x- and y-coordinates of the line fitting step in the PhysioNet output """ fname = "datasets/lorenz.txt" - with pkg_resources.resource_stream(__name__, fname) as f: + with importlib.resources.files("nolds.datasets").joinpath(fname).open("rb") as f: data_in = np.loadtxt(f) fname = "datasets/lorenz_physionet.txt" - with pkg_resources.resource_stream(__name__, fname) as f: + with 
importlib.resources.files("nolds.datasets").joinpath(fname).open("rb") as f: data_out = np.loadtxt(f) return data_in, data_out @@ -430,7 +430,7 @@ def pad_opening_values(values) -> None: data = [] for index in ["^JKSE", "^N225", "^NDX"]: fname = f"datasets/{index}.csv" - with pkg_resources.resource_stream(__name__, fname) as f: + with importlib.resources.files("nolds.datasets").joinpath(fname).open("rb") as f: days, values = load_finance_yahoo_data(f) pad_opening_values(values) data.append((days, values)) From 10e95dbd505918d8768c9b19d903b3fad571cdf2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christopher=20Sch=C3=B6lzel?= Date: Sun, 15 Jun 2025 22:11:38 +0200 Subject: [PATCH 06/36] bugfix: need to specify package, not module --- nolds/datasets.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/nolds/datasets.py b/nolds/datasets.py index e15be2c..c2c7841 100644 --- a/nolds/datasets.py +++ b/nolds/datasets.py @@ -185,7 +185,7 @@ def load_qrandom(): the dataset """ fname = "datasets/qrandom.npy" - with importlib.resources.files("nolds.datasets").joinpath(fname).open("rb") as f: + with importlib.resources.files("nolds").joinpath(fname).open("rb") as f: return np.load(f) @@ -199,7 +199,7 @@ def load_brown72(): the dataset """ fname = "datasets/brown72.npy" - with importlib.resources.files("nolds.datasets").joinpath(fname).open("rb") as f: + with importlib.resources.files("nolds").joinpath(fname).open("rb") as f: return np.load(f) @@ -224,10 +224,10 @@ def load_lorenz_physionet(): x- and y-coordinates of the line fitting step in the PhysioNet output """ fname = "datasets/lorenz.txt" - with importlib.resources.files("nolds.datasets").joinpath(fname).open("rb") as f: + with importlib.resources.files("nolds").joinpath(fname).open("rb") as f: data_in = np.loadtxt(f) fname = "datasets/lorenz_physionet.txt" - with importlib.resources.files("nolds.datasets").joinpath(fname).open("rb") as f: + with 
importlib.resources.files("nolds").joinpath(fname).open("rb") as f: data_out = np.loadtxt(f) return data_in, data_out @@ -430,7 +430,7 @@ def pad_opening_values(values) -> None: data = [] for index in ["^JKSE", "^N225", "^NDX"]: fname = f"datasets/{index}.csv" - with importlib.resources.files("nolds.datasets").joinpath(fname).open("rb") as f: + with importlib.resources.files("nolds").joinpath(fname).open("rb") as f: days, values = load_finance_yahoo_data(f) pad_opening_values(values) data.append((days, values)) From 8d8f52d459790a0f1f325013fee227ad8d68c841 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christopher=20Sch=C3=B6lzel?= Date: Mon, 16 Jun 2025 15:40:08 +0200 Subject: [PATCH 07/36] turns on ruff formatter and fixes all linting errors --- .vscode/settings.json | 6 +- nolds/datasets.py | 305 +++++++++++++++++++++--------------------- pyproject.toml | 2 + 3 files changed, 160 insertions(+), 153 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index dbafbbd..834621f 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -12,11 +12,11 @@ "test_*.py" ], "[python]": { - "editor.formatOnSave": false, // disable format on save for now to not mess up any files that haven't been converted yet + "editor.formatOnSave": true, "editor.defaultFormatter": "charliermarsh.ruff", "editor.codeActionsOnSave": { - "source.fixAll": "never", - "source.organizeImports": "never" + "source.fixAll": "explicit", + "source.organizeImports": "explicit" } } } \ No newline at end of file diff --git a/nolds/datasets.py b/nolds/datasets.py index c2c7841..c903066 100644 --- a/nolds/datasets.py +++ b/nolds/datasets.py @@ -2,8 +2,10 @@ from __future__ import annotations +import csv import datetime import importlib.resources +from typing import IO, Any, Generator import numpy as np @@ -103,8 +105,7 @@ def fbm( random_seed: Seed used for random number generation. 
Returns: - array of float: - simulated fractional brownian motion + Simulated fractional brownian motion """ if H < 0 or H > 1: msg = f"H must be between 0 and 1, got {H} instead." @@ -135,16 +136,15 @@ def fgn( """Generates fractional gaussian noise of desired length. References: - .. [fgn_1] https://en.wikipedia.org/wiki/Fractional_Brownian_motion + .. [fgn_1] https://en.wikipedia.org/wiki/Fractional_Brownian_motion Args: - n: Length of sequence to generate. - H: Hurst parameter. - random_seed: Seed used for random number generation. + n: Length of sequence to generate. + H: Hurst parameter. + random_seed: Seed used for random number generation. Returns: - array of float: - simulated fractional gaussian noise + Simulated fractional gaussian noise """ return np.diff(fbm(n + 1, H=H, random_seed=random_seed)) @@ -157,55 +157,49 @@ def qrandom(n: int) -> np.ndarray[tuple[int], np.dtype[np.uint16]]: This function requires the package quantumrandom and an internet connection. Args: - n (int): - length of the random array + n: length of the random array Return: - array of ints: - array of truly random unsigned 16 bit int values + Array of truly random unsigned 16 bit int values """ import quantumrandom return np.concatenate( [ - quantumrandom.get_data(data_type="uint16", array_length=1024) + np.array(quantumrandom.get_data(data_type="uint16", array_length=1024), dtype=np.uint16) for i in range(int(np.ceil(n / 1024.0))) ] )[:n] -def load_qrandom(): +def load_qrandom() -> np.ndarray[tuple[int], np.dtype[np.int32]]: """Loads a set of 10000 random numbers generated by qrandom. This dataset can be used when you want to do some limited tests with "true" random data without an internet connection. Returns: - int array - the dataset + Dataset of 10k quantum random numbers packaged with nolds. 
""" fname = "datasets/qrandom.npy" with importlib.resources.files("nolds").joinpath(fname).open("rb") as f: return np.load(f) -def load_brown72(): - """Loads the dataset brown72 with a prescribed Hurst exponent of 0.72. +def load_brown72() -> np.ndarray[tuple[int], np.dtype[np.float64]]: + """Returns the dataset brown72 with a prescribed Hurst exponent of 0.72. Source: http://bearcave.com/misl/misl_tech/wavelets/hurst/index.html - - Returns: - float array: - the dataset """ fname = "datasets/brown72.npy" with importlib.resources.files("nolds").joinpath(fname).open("rb") as f: return np.load(f) -def load_lorenz_physionet(): - """Loads a dataset containing the X variable of the Lorenz system - as well as the output of PhysioNet's dfa implementation on that dataset. +def load_lorenz_physionet() -> tuple[ + np.ndarray[tuple[int], np.dtype[np.float64]], np.ndarray[tuple[int, int], np.dtype[np.float64]] +]: + """Loads a dataset of the Lorenz system (X variable) and PhysioNet's DFA output on that dataset. The input data was created with the following code: @@ -218,10 +212,10 @@ def load_lorenz_physionet(): dfa < lorenz.txt > lorenz_physionet.txt Returns: - 1d float array: - time series of the X variable of the Lorenz system that was used as input - 2d float array: - x- and y-coordinates of the line fitting step in the PhysioNet output + Tuple containing + + - time series of the X variable of the Lorenz system that was used as input + - x- and y-coordinates of the line fitting step in the PhysioNet output """ fname = "datasets/lorenz.txt" with importlib.resources.files("nolds").joinpath(fname).open("rb") as f: @@ -232,153 +226,142 @@ def load_lorenz_physionet(): return data_in, data_out -def tent_map(x, steps, mu=2): +def tent_map(x: float, steps: int, mu: int = 2) -> Generator[float, None, None]: """Generates a time series of the tent map. Characteristics and Background: - The name of the tent map is derived from the fact that the plot of x_i vs - x_i+1 looks like a tent. 
For mu > 1 one application of the mapping function - can be viewed as stretching the surface on which the value is located and - then folding the area that is greater than one back towards the zero. This - corresponds nicely to the definition of chaos as expansion in one dimension - which is counteracted by a compression in another dimension. + The name of the tent map is derived from the fact that the plot of x_i vs + x_i+1 looks like a tent. For mu > 1 one application of the mapping function + can be viewed as stretching the surface on which the value is located and + then folding the area that is greater than one back towards the zero. This + corresponds nicely to the definition of chaos as expansion in one dimension + which is counteracted by a compression in another dimension. Calculating the Lyapunov exponent: - The lyapunov exponent of the tent map can be easily calculated as due to - this stretching behavior a small difference delta between two neighboring - points will indeed grow exponentially by a factor of mu in each iteration. - We thus can assume that: + The lyapunov exponent of the tent map can be easily calculated as due to + this stretching behavior a small difference delta between two neighboring + points will indeed grow exponentially by a factor of mu in each iteration. + We thus can assume that: - delta_n = delta_0 * mu^n + delta_n = delta_0 * mu^n - We now only have to change the basis to e to obtain the exact formula that - is used for the definition of the lyapunov exponent: + We now only have to change the basis to e to obtain the exact formula that + is used for the definition of the lyapunov exponent: - delta_n = delta_0 * e^(ln(mu) * n) + delta_n = delta_0 * e^(ln(mu) * n) - Therefore the lyapunov exponent of the tent map is: + Therefore the lyapunov exponent of the tent map is: - lambda = ln(mu) + lambda = ln(mu) References: - .. [tm_1] https://en.wikipedia.org/wiki/Tent_map + .. 
[tm_1] https://en.wikipedia.org/wiki/Tent_map Args: - x (float): - starting point - steps (int): - number of steps for which the generator should run - - Kwargs: - mu (int): - parameter mu that controls the behavior of the map + x: starting point + steps: number of steps for which the generator should run + mu: parameter mu that controls the behavior of the map - Returns: - generator object: - the generator that creates the time series + Yields: + The next value in the tent map series. """ for _ in range(steps): - x = mu * x if x < 0.5 else mu * (1 - x) + x = mu * x if x < 0.5 else mu * (1 - x) # noqa: PLR2004 yield x -# TODO should all math be formatted like this, or should the documentation of -# logistic_map revert to a version that is more readable as plain text - - -def logistic_map(x, steps, r=4): +def logistic_map(x: float, steps: int, r: float = 4) -> Generator[float, None, None]: r"""Generates a time series of the logistic map. Characteristics and Background: - The logistic map is among the simplest examples for a time series that can - exhibit chaotic behavior depending on the parameter r. For r between 2 and - 3, the series quickly becomes static. At r=3 the first bifurcation point is - reached after which the series starts to oscillate. Beginning with r = 3.6 - it shows chaotic behavior with a few islands of stability until perfect - chaos is achieved at r = 4. + The logistic map is among the simplest examples for a time series that can + exhibit chaotic behavior depending on the parameter r. For r between 2 and + 3, the series quickly becomes static. At r=3 the first bifurcation point is + reached after which the series starts to oscillate. Beginning with r = 3.6 + it shows chaotic behavior with a few islands of stability until perfect + chaos is achieved at r = 4. 
Calculating the Lyapunov exponent: - To calculate the "true" Lyapunov exponent of the logistic map, we first - have to make a few observations for maps in general that are repeated - applications of a function to a starting value. + To calculate the "true" Lyapunov exponent of the logistic map, we first + have to make a few observations for maps in general that are repeated + applications of a function to a starting value. - If we have two starting values that differ by some infinitesimal - :math:`delta_0` then according to the definition of the lyapunov exponent - we will have an exponential divergence: + If we have two starting values that differ by some infinitesimal + :math:`delta_0` then according to the definition of the lyapunov exponent + we will have an exponential divergence: - .. math:: - |\delta_n| = |\delta_0| e^{\lambda n} + .. math:: + |\delta_n| = |\delta_0| e^{\lambda n} - We can now write that: + We can now write that: - .. math:: - e^{\lambda n} = \lim_{\delta_0 -> 0} |\frac{\delta_n}{\delta_0}| + .. math:: + e^{\lambda n} = \lim_{\delta_0 -> 0} |\frac{\delta_n}{\delta_0}| - This is the definition of the derivative :math:`\frac{dx_n}{dx_0}` of a - point :math:`x_n` in the time series with respect to the starting point - :math:`x_0` (or rather the absolute value of that derivative). Now we can - use the fact that due to the definition of our map as repetitive - application of some f we have: + This is the definition of the derivative :math:`\frac{dx_n}{dx_0}` of a + point :math:`x_n` in the time series with respect to the starting point + :math:`x_0` (or rather the absolute value of that derivative). Now we can + use the fact that due to the definition of our map as repetitive + application of some f we have: - .. math:: - f^{n\prime}(x) = f(f(f(...f(x_0)...))) = f'(x_n-1) \cdot f'(x_n-2) - \cdot ... \cdot f'(x_0) + .. math:: + f^{n\prime}(x) = f(f(f(...f(x_0)...))) = f'(x_n-1) \cdot f'(x_n-2) + \cdot ... \cdot f'(x_0) - with + with - .. 
math:: - e^{\lambda n} = |f^{n\prime}(x)| + .. math:: + e^{\lambda n} = |f^{n\prime}(x)| - we now have + we now have - .. math:: + .. math:: - e^{\lambda n} &= |f'(x_n-1) \cdot f'(x_n-2) \cdot ... \cdot f'(x_0)| \\ - \Leftrightarrow \\ - \lambda n &= \ln |f'(x_n-1) \cdot f'(x_n-2) \cdot ... \cdot f'(x_0)| \\ - \Leftrightarrow \\ - \lambda &= \frac{1}{n} \ln |f'(x_n-1) \cdot f'(x_n-2) \cdot ... \cdot f'(x_0)| \\ - &= \frac{1}{n} \sum_{k=0}^{n-1} \ln |f'(x_k)| + e^{\lambda n} &= |f'(x_n-1) \cdot f'(x_n-2) \cdot ... \cdot f'(x_0)| \\ + \Leftrightarrow \\ + \lambda n &= \ln |f'(x_n-1) \cdot f'(x_n-2) \cdot ... \cdot f'(x_0)| \\ + \Leftrightarrow \\ + \lambda &= \frac{1}{n} \ln |f'(x_n-1) \cdot f'(x_n-2) \cdot ... \cdot f'(x_0)| \\ + &= \frac{1}{n} \sum_{k=0}^{n-1} \ln |f'(x_k)| - With this sum we can now calculate the lyapunov exponent for any map. - For the logistic map we simply have to calculate :math:`f'(x)` and as we - have + With this sum we can now calculate the lyapunov exponent for any map. + For the logistic map we simply have to calculate :math:`f'(x)` and as we + have - .. math:: - f(x) = r x (1-x) = rx - rx² + .. math:: + f(x) = r x (1-x) = rx - rx² - we now get + we now get - .. math:: - f'(x) = r - 2 rx + .. math:: + f'(x) = r - 2 rx References: - .. [lm_1] https://en.wikipedia.org/wiki/Tent_map - .. [lm_2] https://blog.abhranil.net/2015/05/15/lyapunov-exponent-of-the-logistic-map-mathematica-code/ + .. [lm_1] https://en.wikipedia.org/wiki/Tent_map + .. 
[lm_2] https://blog.abhranil.net/2015/05/15/lyapunov-exponent-of-the-logistic-map-mathematica-code/ Args: - x (float): - starting point - steps (int): - number of steps for which the generator should run - - Kwargs: - r (int): - parameter r that controls the behavior of the map + x: starting point + steps: number of steps for which the generator should run + r: parameter r that controls the behavior of the map - Returns: - generator object: - the generator that creates the time series + Yields: + The next value in the logistic map time series. """ for _ in range(steps): x = r * x * (1 - x) yield x -def load_financial(): - """Loads the following datasets from CSV files in this package: +def load_financial() -> list[ + tuple[ + np.ndarray[tuple[int], np.dtype[np.datetime64]], + np.ndarray[tuple[int, int], np.dtype[np.float64]], + ] +]: + """Loads three financial datasets from CSV files in this package. - jkse: Jakarta Composite Index, downloaded on 2019-02-12 from https://finance.yahoo.com/quote/%5EJKSE/history?period1=631148400&period2=988668000&interval=1d&filter=history&frequency=1d - n225: Nikkei 225, downloaded on 2019-02-12 from https://finance.yahoo.com/quote/%5EN225/history?period1=631148400&period2=988668000&interval=1d&filter=history&frequency=1d @@ -394,26 +377,29 @@ def load_financial(): day where data was available. Returns: - list of tuple(1d-array, 2d-array): - datasets with days as array of date objects and 2d-array with the columns + Datasets with days as array of date objects and 2d-array with the columns "Open", "High", "Low", "Close", "Adj Close", and "Volume". Note that "Open" values have been padded to ensure that there are no NaNs left. 
""" - def load_finance_yahoo_data(f): - f.readline() + def load_finance_yahoo_data( + f: IO[str], + ) -> tuple[ + np.ndarray[tuple[int], np.dtype[np.datetime64]], + np.ndarray[tuple[int, int], np.dtype[np.float64]], + ]: days = [] values = [] - for l in f: - fields = l.decode("utf-8") - fields = fields.split(",") - d = datetime.datetime.strptime(fields[0], "%Y-%m-%d") - v = [np.nan if x.strip() == "null" else float(x) for x in fields[1:]] + reader = csv.reader(f, delimiter=",") + next(reader) # skip header row + for row in reader: + d = datetime.datetime.strptime(row[0], "%Y-%m-%d").astimezone(datetime.timezone.utc) + v = [np.nan if x.strip() == "null" else float(x) for x in row[1:]] days.append(d) values.append(v) return np.array(days), np.array(values) - def pad_opening_values(values) -> None: + def pad_opening_values(values: np.ndarray[tuple[int, int], np.dtype[np.float64]]) -> None: # fill first value from future if required first = 0 while np.isnan(values[first, 0]): @@ -430,14 +416,16 @@ def pad_opening_values(values) -> None: data = [] for index in ["^JKSE", "^N225", "^NDX"]: fname = f"datasets/{index}.csv" - with importlib.resources.files("nolds").joinpath(fname).open("rb") as f: + with importlib.resources.files("nolds").joinpath(fname).open("r", encoding="utf-8") as f: days, values = load_finance_yahoo_data(f) pad_opening_values(values) data.append((days, values)) return data -def barabasi1991_fractal(size, iterations, b1=0.8, b2=0.5): +def barabasi1991_fractal( + size: int, iterations: int, b1: float = 0.8, b2: float = 0.5 +) -> np.ndarray[tuple[int], np.dtype[np.float64]]: """Generates the simple fractal described in [bf]_. The fractal divides a rectangular segment starting at (x0, y0) with width w @@ -451,27 +439,24 @@ def barabasi1991_fractal(size, iterations, b1=0.8, b2=0.5): from the previous iteration are subdivided according to the same rule. References: - .. [bf] A.-L. Barabási and T. 
Vicsek, “Multifractality of self-affine - fractals,” Physical Review A, vol. 44, no. 4, pp. 2730–2733, 1991. + .. [bf] A.-L. Barabási and T. Vicsek, “Multifractality of self-affine + fractals,” Physical Review A, vol. 44, no. 4, pp. 2730–2733, 1991. Args: - size (int): - number of data points in the resulting array - iterations (int): - number of iterations to perform + size: number of data points in the resulting array + iterations: number of iterations to perform Kwargs: - b1 (float): - relative height at x1 (between 0 and 1) - b2 (float): - relative height at x3 (between 0 and 1) + b1: relative height at x1 (between 0 and 1) + b2: relative height at x3 (between 0 and 1) Returns: - (1d-array of float): - generated fractal + The generated fractal """ - def b1991(x0, y0, w, h): + def b1991( + x0: int, y0: int, w: int, h: int + ) -> tuple[np.ndarray[tuple[int], np.dtype[np.float64]], list[int]]: if h < 0: # for a segment with negative slope we have flip the x-axis d, nxtp = b1991(x0, y0 + h, w, -h) @@ -501,5 +486,25 @@ def b1991(x0, y0, w, h): return fractal -brown72 = load_brown72() -jkse, n225, ndx = load_financial() +def __getattr__(name: str) -> Any: # noqa: ANN401 + """Provide datasets as variables via lazy evaluation. + + See https://peps.python.org/pep-0562/ for documentation of this type of + module-level __getattr__. 
+ """ + if name == "brown72": + global brown72 # noqa: PLW0603 + brown72 = load_brown72() + return brown72 + if name in ["jkse", "n225", "ndx"]: + global jkse, n225, ndx + jkse, n225, ndx = load_financial() + match name: + case "jkse": + return jkse + case "n225": + return n225 + case "ndx": + return ndx + msg = f"module {__name__!r} has no attribute {name!r}" + raise AttributeError(msg) diff --git a/pyproject.toml b/pyproject.toml index d342702..6a08c62 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -84,6 +84,8 @@ ignore = [ "TD002", # adding an author to TODOs wastes space and is redundant because of git history "TD003", # once there is an issue, there is no need to keep the TODO => don't require links "COM812", # missing trailing commas are already fixed by the formatter + "PLR0913", # this is a scientific library, functions just have a bazillion parameters + "RUF002", # en-dashes are used for bibliographical references in docstrings ] [tool.ruff.lint.pydocstyle] From ee4946ba001b26413e5698af8b59ed820ceceb38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christopher=20Sch=C3=B6lzel?= Date: Mon, 16 Jun 2025 15:40:15 +0200 Subject: [PATCH 08/36] updates changelog --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index f666c12..c7e243b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,11 @@ and this project adheres to [Semantic Versioning](http://semver.org/). * Switches from using `setup.py` to `pyproject.toml` using `uv`. * Moves ruff config into `pyproject.toml`. +* CSV-based datasets are now loaded with `csv.reader`. +* Datasets that are available as global variables are loaded lazily now. +* `datasets.qrandom` output now has a more accurate dtype of `np.uint16`. +* Uses `importlib.resources.files` instead of deprecated `pkg_resources.resource_stream`. +* Applies Ruff formatting throughout the codebase and addresses all linting errors. 
### Fixed From 9ecd7fe1b119fe1dd47f47a0fe2932fafcea469d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christopher=20Sch=C3=B6lzel?= Date: Mon, 16 Jun 2025 17:16:19 +0200 Subject: [PATCH 09/36] fixes all linting errors for lyap_r --- nolds/measures.py | 379 ++++++++++++++++++++++++++++------------------ pyproject.toml | 6 +- 2 files changed, 234 insertions(+), 151 deletions(-) diff --git a/nolds/measures.py b/nolds/measures.py index 417aecf..389915c 100644 --- a/nolds/measures.py +++ b/nolds/measures.py @@ -1,18 +1,57 @@ +"""Main module containing all measures implemented in nolds.""" + +from __future__ import annotations + import math import warnings +from pathlib import Path +from typing import Literal, TypeVar, cast, overload import numpy as np +D = TypeVar("D", bound=np.integer | np.floating) -def rowwise_chebyshev(x, y): + +def rowwise_chebyshev( + x: np.ndarray[tuple[int, int], np.dtype[D]], y: np.ndarray[tuple[int], np.dtype[D]] +) -> np.ndarray[tuple[int], np.dtype[D]]: + """Returns the Chebyshev distances between each row of matrix x and the reference row y.""" return np.max(np.abs(x - y), axis=1) -def rowwise_euclidean(x, y): +def rowwise_euclidean( + x: np.ndarray[tuple[int, int], np.dtype[D]], y: np.ndarray[tuple[int], np.dtype[D]] +) -> np.ndarray[tuple[int], np.dtype[D]]: + """Returns the Euclidean distances between each row of matrix x and the reference row y.""" return np.sqrt(np.sum((x - y) ** 2, axis=1)) -def poly_fit(x, y, degree, fit="RANSAC"): +FittingMethod = Literal["RANSAC", "poly"] + + +def poly_fit( + x: np.ndarray[tuple[int], np.dtype[D]], + y: np.ndarray[tuple[int], np.dtype[D]], + degree: int, + fit: FittingMethod = "RANSAC", +) -> np.ndarray[tuple[int], np.dtype[np.float32]]: + """Fits a polynomial of the given degree to the data. + + This currently supports two fittting algorithms. + + - "poly" uses the standard `np.ployfit` function to perform a least squares fit. 
+ - "RANSAC" uses the RANSAC algorithm, which is more robust to outliers but + introuces inaccuracies due to randomness. + + If "RANSAC" is chosen, but scikit-learn is not installed, "poly" is used as + a fallback option. + + Args: + x: x-axis values + y: y-axis values + degree: degree of the polynomial + fit: algorithm to use for fitting + """ # check if we can use RANSAC if fit == "RANSAC": try: @@ -30,7 +69,7 @@ def poly_fit(x, y, degree, fit="RANSAC"): fit = "poly" if fit == "poly": - return np.polyfit(x, y, degree) + return np.polyfit(x, y, degree).astype(np.float32) if fit == "RANSAC": model = sklin.RANSACRegressor(sklin.LinearRegression(fit_intercept=False)) xdat = np.asarray(x) @@ -41,7 +80,7 @@ def poly_fit(x, y, degree, fit="RANSAC"): polydat = skpre.PolynomialFeatures(degree).fit_transform(xdat) try: model.fit(polydat, y) - coef = model.estimator_.coef_[::-1] + coef = cast("sklin.LinearRegression", model.estimator_).coef_[::-1] except ValueError: warnings.warn( "RANSAC did not reach consensus, using numpy's polyfit", @@ -49,30 +88,27 @@ def poly_fit(x, y, degree, fit="RANSAC"): stacklevel=2, ) coef = np.polyfit(x, y, degree) - return coef + return coef.astype(np.float32) msg = f"invalid fitting mode ({fit})" raise ValueError(msg) -def delay_embedding(data, emb_dim, lag=1): +def delay_embedding( + data: np.typing.FloatArrayLike | np.typing.IntArrayLike, emb_dim: int, lag: int = 1 +) -> np.ndarray[tuple[int, int], np.dtype[np.float32]]: """Perform a time-delay embedding of a time series. 
Args: - data (array-like): - the data that should be embedded - emb_dim (int): - the embedding dimension - Kwargs: - lag (int): - the lag between elements in the embedded vectors + data: the data that should be embedded + emb_dim: the embedding dimension + lag: the lag between elements in the embedded vectors Returns: - emb_dim x m array: - matrix of embedded vectors of the form + Matrix of shape (m, emb_dim) containing embedded vectors of the form [data[i], data[i+lag], data[i+2*lag], ... data[i+(emb_dim-1)*lag]] for i in 0 to m-1 (m = len(data)-(emb_dim-1)*lag) """ - data = np.asarray(data) + data = np.asarray(data, dtype=np.float32) min_len = (emb_dim - 1) * lag + 1 if len(data) < min_len: msg = ( @@ -86,162 +122,204 @@ def delay_embedding(data, emb_dim, lag=1): return data[indices] -def lyap_r_len(**kwargs): - """Helper function that calculates the minimum number of data points required - to use lyap_r. +def lyap_r_len(emb_dim: int, lag: int, trajectory_len: int, min_tsep: int) -> int: + """Calculates the minimum number of data points required to use lyap_r. Note that none of the required parameters may be set to None. 
- Kwargs: - kwargs(dict): - arguments used for lyap_r (required: emb_dim, lag, trajectory_len and - min_tsep) + Args: + emb_dim: embedding dimension for delay embedding + lag: lag for delay embedding + min_tsep: minimal temporal separation (in number of data points) between two "neighbors" + trajectory_len: the time (in number of data points) to follow the distance + trajectories between two neighboring points + Returns: - minimum number of data points required to call lyap_r with the given - parameters + minimum number of data points required to call lyap_r with the given + parameters """ # minimum length required to find single orbit vector - min_len = (kwargs["emb_dim"] - 1) * kwargs["lag"] + 1 + min_len = (emb_dim - 1) * lag + 1 # we need trajectory_len orbit vectors to follow a complete trajectory - min_len += kwargs["trajectory_len"] - 1 + min_len += trajectory_len - 1 # we need min_tsep * 2 + 1 orbit vectors to find neighbors for each - min_len += kwargs["min_tsep"] * 2 + 1 + min_len += min_tsep * 2 + 1 return min_len +@overload def lyap_r( - data, - emb_dim=10, - lag=None, - min_tsep=None, - tau=1, - min_neighbors=20, - trajectory_len=20, - fit="RANSAC", - debug_plot=False, - debug_data=False, - plot_file=None, - fit_offset=0, + data: np.typing.FloatArrayLike | np.typing.IntArrayLike, + emb_dim: int = 10, + *, + lag: int | None = None, + min_tsep: int | None = None, + tau: float = 1, + min_neighbors: int = 20, + trajectory_len: int = 20, + fit: FittingMethod = "RANSAC", + debug_plot: bool = False, + debug_data: Literal[False] = False, + plot_file: str | Path | None = None, + fit_offset: int = 0, +) -> float: ... 
+ + +@overload +def lyap_r( + data: np.typing.FloatArrayLike | np.typing.IntArrayLike, + emb_dim: int = 10, + *, + lag: int | None = None, + min_tsep: int | None = None, + tau: float = 1, + min_neighbors: int = 20, + trajectory_len: int = 20, + fit: FittingMethod = "RANSAC", + debug_plot: bool = False, + debug_data: Literal[True] = True, + plot_file: str | Path | None = None, + fit_offset: int = 0, +) -> tuple[ + float, + tuple[ + np.ndarray[tuple[int], np.dtype[np.int32]], + np.ndarray[tuple[int], np.dtype[np.float32]], + np.ndarray[tuple[int], np.dtype[np.float32]], + ], +]: ... + + +def lyap_r( # noqa: C901, PLR0912, PLR0915 + data: np.typing.FloatArrayLike | np.typing.IntArrayLike, + emb_dim: int = 10, + *, + lag: int | None = None, + min_tsep: int | None = None, + tau: float = 1, + min_neighbors: int = 20, + trajectory_len: int = 20, + fit: FittingMethod = "RANSAC", + debug_plot: bool = False, + debug_data: bool = False, + plot_file: str | Path | None = None, + fit_offset: int = 0, +) -> ( + float + | tuple[ + float, + tuple[ + np.ndarray[tuple[int], np.dtype[np.int32]], + np.ndarray[tuple[int], np.dtype[np.float32]], + np.ndarray[tuple[int], np.dtype[np.float32]], + ], + ] ): - """Estimates the largest Lyapunov exponent using the algorithm of Rosenstein - et al. [lr_1]_. + """Estimates the largest Lyapunov exponent with the method of Rosenstein et al. [lr_1]_. Explanation of Lyapunov exponents: - See lyap_e. + See lyap_e. Explanation of the algorithm: - The algorithm of Rosenstein et al. is only able to recover the largest - Lyapunov exponent, but behaves rather robust to parameter choices. - - The idea for the algorithm relates closely to the definition of Lyapunov - exponents. 
First, the dynamics of the data are reconstructed using a delay - embedding method with a lag, such that each value x_i of the data is mapped - to the vector - - X_i = [x_i, x_(i+lag), x_(i+2*lag), ..., x_(i+(emb_dim-1) * lag)] - - For each such vector X_i, we find the closest neighbor X_j using the - euclidean distance. We know that as we follow the trajectories from X_i and - X_j in time in a chaotic system the distances between X_(i+k) and X_(j+k) - denoted as d_i(k) will increase according to a power law - d_i(k) = c * e^(lambda * k) where lambda is a good approximation of the - highest Lyapunov exponent, because the exponential expansion along the axis - associated with this exponent will quickly dominate the expansion or - contraction along other axes. - - To calculate lambda, we look at the logarithm of the distance trajectory, - because log(d_i(k)) = log(c) + lambda * k. This gives a set of lines - (one for each index i) whose slope is an approximation of lambda. We - therefore extract the mean log trajectory d'(k) by taking the mean of - log(d_i(k)) over all orbit vectors X_i. We then fit a straight line to - the plot of d'(k) versus k. The slope of the line gives the desired - parameter lambda. + The algorithm of Rosenstein et al. is only able to recover the largest + Lyapunov exponent, but behaves rather robust to parameter choices. + + The idea for the algorithm relates closely to the definition of Lyapunov + exponents. First, the dynamics of the data are reconstructed using a delay + embedding method with a lag, such that each value x_i of the data is mapped + to the vector + + X_i = [x_i, x_(i+lag), x_(i+2*lag), ..., x_(i+(emb_dim-1) * lag)] + + For each such vector X_i, we find the closest neighbor X_j using the + euclidean distance. 
We know that as we follow the trajectories from X_i and + X_j in time in a chaotic system the distances between X_(i+k) and X_(j+k) + denoted as d_i(k) will increase according to a power law + d_i(k) = c * e^(lambda * k) where lambda is a good approximation of the + highest Lyapunov exponent, because the exponential expansion along the axis + associated with this exponent will quickly dominate the expansion or + contraction along other axes. + + To calculate lambda, we look at the logarithm of the distance trajectory, + because log(d_i(k)) = log(c) + lambda * k. This gives a set of lines + (one for each index i) whose slope is an approximation of lambda. We + therefore extract the mean log trajectory d'(k) by taking the mean of + log(d_i(k)) over all orbit vectors X_i. We then fit a straight line to + the plot of d'(k) versus k. The slope of the line gives the desired + parameter lambda. Method for choosing min_tsep: - Usually we want to find neighbors between points that are close in phase - space but not too close in time, because we want to avoid spurious - correlations between the obtained trajectories that originate from temporal - dependencies rather than the dynamic properties of the system. Therefore it - is critical to find a good value for min_tsep. One rather plausible - estimate for this value is to set min_tsep to the mean period of the - signal, which can be obtained by calculating the mean frequency using the - fast fourier transform. This procedure is used by default if the user sets - min_tsep = None. Note that this default procedure uses a naive approach - for estimating the power spectral density, which just takes the FFT of the - whole signal without applying any windowing function to avoid biases. If - you have a non-stationary input and want more than a rough estimate, - consider calculating min_tsep manually using a sliding window approach - like Welch's method (implemented in `scipy.signal.welch`). 
+ Usually we want to find neighbors between points that are close in phase + space but not too close in time, because we want to avoid spurious + correlations between the obtained trajectories that originate from temporal + dependencies rather than the dynamic properties of the system. Therefore it + is critical to find a good value for min_tsep. One rather plausible + estimate for this value is to set min_tsep to the mean period of the + signal, which can be obtained by calculating the mean frequency using the + fast fourier transform. This procedure is used by default if the user sets + min_tsep = None. Note that this default procedure uses a naive approach + for estimating the power spectral density, which just takes the FFT of the + whole signal without applying any windowing function to avoid biases. If + you have a non-stationary input and want more than a rough estimate, + consider calculating min_tsep manually using a sliding window approach + like Welch's method (implemented in `scipy.signal.welch`). Method for choosing lag: - Another parameter that can be hard to choose by instinct alone is the lag - between individual values in a vector of the embedded orbit. Here, - Rosenstein et al. suggest to set the lag to the distance where the - autocorrelation function drops below 1 - 1/e times its original (maximal) - value. This procedure is used by default if the user sets lag = None. + Another parameter that can be hard to choose by instinct alone is the lag + between individual values in a vector of the embedded orbit. Here, + Rosenstein et al. suggest to set the lag to the distance where the + autocorrelation function drops below 1 - 1/e times its original (maximal) + value. This procedure is used by default if the user sets lag = None. References: - .. [lr_1] M. T. Rosenstein, J. J. Collins, and C. J. De Luca, - “A practical method for calculating largest Lyapunov exponents from - small data sets,” Physica D: Nonlinear Phenomena, vol. 65, no. 1, - pp. 
117–134, 1993. + .. [lr_1] M. T. Rosenstein, J. J. Collins, and C. J. De Luca, + “A practical method for calculating largest Lyapunov exponents from + small data sets,” Physica D: Nonlinear Phenomena, vol. 65, no. 1, + pp. 117–134, 1993. Reference Code: - .. [lr_a] mirwais, "Largest Lyapunov Exponent with Rosenstein's Algorithm", - url: http://www.mathworks.com/matlabcentral/fileexchange/38424-largest-lyapunov-exponent-with-rosenstein-s-algorithm - .. [lr_b] Shapour Mohammadi, "LYAPROSEN: MATLAB function to calculate - Lyapunov exponent", - url: https://ideas.repec.org/c/boc/bocode/t741502.html - .. [lr_c] Rainer Hegger, Holger Kantz, and Thomas Schreiber, "TISEAN 3.0.0 - Nonlinear Time Series Analysis", - url: https://www.pks.mpg.de/tisean/Tisean_3.0.0/docs/docs_c/lyap_r.html + .. [lr_a] mirwais, "Largest Lyapunov Exponent with Rosenstein's Algorithm", + url: http://www.mathworks.com/matlabcentral/fileexchange/38424-largest-lyapunov-exponent-with-rosenstein-s-algorithm + .. [lr_b] Shapour Mohammadi, "LYAPROSEN: MATLAB function to calculate + Lyapunov exponent", + url: https://ideas.repec.org/c/boc/bocode/t741502.html + .. 
[lr_c] Rainer Hegger, Holger Kantz, and Thomas Schreiber, + "TISEAN 3.0.0 - Nonlinear Time Series Analysis", + url: https://www.pks.mpg.de/tisean/Tisean_3.0.0/docs/docs_c/lyap_r.html Args: - data (iterable of float): - (one-dimensional) time series - Kwargs: - emb_dim (int): - embedding dimension for delay embedding - lag (float): - lag for delay embedding - min_tsep (float): - minimal temporal separation between two "neighbors" (default: - find a suitable value by calculating the mean period of the data) - tau (float): - step size between data points in the time series in seconds - (normalization scaling factor for exponents) - min_neighbors (int): - if lag=None, the search for a suitable lag will be stopped when the - number of potential neighbors for a vector drops below min_neighbors - trajectory_len (int): - the time (in number of data points) to follow the distance - trajectories between two neighboring points - fit (str): - the fitting method to use for the line fit, either 'poly' for normal - least squares polynomial fitting or 'RANSAC' for RANSAC-fitting which - is more robust to outliers - debug_plot (boolean): - if True, a simple plot of the final line-fitting step will - be shown - debug_data (boolean): - if True, debugging data will be returned alongside the result - plot_file (str): - if debug_plot is True and plot_file is not None, the plot will be saved - under the given file name instead of directly showing it through - ``plt.show()`` - fit_offset (int): - neglect the first fit_offset steps when fitting + data: (one-dimensional) time series + emb_dim: embedding dimension for delay embedding + lag: lag for delay embedding + min_tsep: minimal temporal separation between two "neighbors" (default: + find a suitable value by calculating the mean period of the data) + tau: step size between data points in the time series in seconds + (normalization scaling factor for exponents) + min_neighbors: if lag=None, the search for a suitable lag will be stopped 
when the + number of potential neighbors for a vector drops below min_neighbors + trajectory_len: the time (in number of data points) to follow the distance + trajectories between two neighboring points + fit: the fitting method to use for the line fit, either 'poly' for normal + least squares polynomial fitting or 'RANSAC' for RANSAC-fitting which + is more robust to outliers + debug_plot: if True, a simple plot of the final line-fitting step will + be shown + debug_data: if True, debugging data will be returned alongside the result + plot_file: if debug_plot is True and plot_file is not None, the plot will be saved + under the given file name instead of directly showing it through + ``plt.show()`` + fit_offset: neglect the first fit_offset steps when fitting Returns: - float: - an estimate of the largest Lyapunov exponent (a positive exponent is - a strong indicator for chaos) - (1d-vector, 1d-vector, list): - only present if debug_data is True: debug data of the form - ``(ks, div_traj, poly)`` where ``ks`` are the x-values of the line fit, - ``div_traj`` are the y-values and ``poly`` are the line coefficients - (``[slope, intercept]``). - + An estimate of the largest Lyapunov exponent (a positive exponent is + a strong indicator for chaos). If `debug_data = True`, the return + type is a tuple instead with the second element being another tuple + containing + + - the x-values of the line fit + - the y-values of the line fit + - the line coefficients (`[slope, intercept]`). 
""" # convert data to float to avoid overflow errors in rowwise_euclidean data = np.asarray(data, dtype=np.float64) @@ -279,9 +357,8 @@ def lyap_r( eps = acorr[n - 1] * (1 - 1.0 / np.e) lag = 1 - # small helper function to calculate resulting number of vectors for a - # given lag value - def nb_neighbors(lag_value): + def nb_neighbors(lag_value: int) -> int: + """Returns resulting number of vectors for a given lag value.""" min_len = lyap_r_len( emb_dim=emb_dim, lag=lag_value, @@ -337,21 +414,23 @@ def nb_neighbors(lag_value): raise ValueError(msg.format(-ntraj + 1)) if ntraj < min_traj: # not enough data points => there are rows where all values are inf - assert np.any(np.all(np.isinf(dists[:ntraj, :ntraj]), axis=1)) + assert np.any(np.all(np.isinf(dists[:ntraj, :ntraj]), axis=1)), "no inf rows found" msg = ( "Not enough data points. At least {} trajectories are required " "to find a valid neighbor for each orbit vector with min_tsep={} " "but only {} could be created." ) raise ValueError(msg.format(min_traj, min_tsep, ntraj)) - assert np.all(np.any(np.isfinite(dists[:ntraj, :ntraj]), axis=1)) + assert np.all(np.any(np.isfinite(dists[:ntraj, :ntraj]), axis=1)), ( + "some distances are not finite" + ) # find nearest neighbors (exclude last columns, because these vectors cannot # be followed in time for trajectory_len steps) nb_idx = np.argmin(dists[:ntraj, :ntraj], axis=1) # build divergence trajectory by averaging distances along the trajectory # over all neighbor pairs - div_traj = np.zeros(trajectory_len, dtype=float) + div_traj = np.zeros(trajectory_len, dtype=np.float32) for k in range(trajectory_len): # calculate mean trajectory distance at step k indices = (np.arange(ntraj) + k, nb_idx + k) @@ -371,7 +450,7 @@ def nb_neighbors(lag_value): if len(ks) < 1: # if all points or all but one point in the trajectory is -inf, we cannot # fit a line through the remaining points => return -inf as exponent - poly = [-np.inf, 0] + poly = np.array([-np.inf, 0], 
dtype=np.float32) else: # normal line fitting poly = poly_fit(ks[fit_offset:], div_traj[fit_offset:], 1, fit=fit) diff --git a/pyproject.toml b/pyproject.toml index 6a08c62..7941338 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -85,8 +85,12 @@ ignore = [ "TD003", # once there is an issue, there is no need to keep the TODO => don't require links "COM812", # missing trailing commas are already fixed by the formatter "PLR0913", # this is a scientific library, functions just have a bazillion parameters - "RUF002", # en-dashes are used for bibliographical references in docstrings + "S101", # nolds uses assers as legitimate checks for programming (not user) errors ] +allowed-confusables = [ + "–" # en-dashes are used for bibliographical references in docstrings +] + [tool.ruff.lint.pydocstyle] convention = "google" From 63cba9f626952fca51109970ed90325f99b7b083 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christopher=20Sch=C3=B6lzel?= Date: Mon, 16 Jun 2025 17:33:46 +0200 Subject: [PATCH 10/36] fixes tests by defaulting to float64 for precision --- nolds/measures.py | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/nolds/measures.py b/nolds/measures.py index 389915c..6e55659 100644 --- a/nolds/measures.py +++ b/nolds/measures.py @@ -34,7 +34,7 @@ def poly_fit( y: np.ndarray[tuple[int], np.dtype[D]], degree: int, fit: FittingMethod = "RANSAC", -) -> np.ndarray[tuple[int], np.dtype[np.float32]]: +) -> np.ndarray[tuple[int], np.dtype[np.float64]]: """Fits a polynomial of the given degree to the data. This currently supports two fittting algorithms. 
@@ -69,7 +69,7 @@ def poly_fit( fit = "poly" if fit == "poly": - return np.polyfit(x, y, degree).astype(np.float32) + return np.polyfit(x, y, degree) if fit == "RANSAC": model = sklin.RANSACRegressor(sklin.LinearRegression(fit_intercept=False)) xdat = np.asarray(x) @@ -88,14 +88,14 @@ def poly_fit( stacklevel=2, ) coef = np.polyfit(x, y, degree) - return coef.astype(np.float32) + return coef msg = f"invalid fitting mode ({fit})" raise ValueError(msg) def delay_embedding( data: np.typing.FloatArrayLike | np.typing.IntArrayLike, emb_dim: int, lag: int = 1 -) -> np.ndarray[tuple[int, int], np.dtype[np.float32]]: +) -> np.ndarray[tuple[int, int], np.dtype[np.float64]]: """Perform a time-delay embedding of a time series. Args: @@ -108,7 +108,8 @@ def delay_embedding( [data[i], data[i+lag], data[i+2*lag], ... data[i+(emb_dim-1)*lag]] for i in 0 to m-1 (m = len(data)-(emb_dim-1)*lag) """ - data = np.asarray(data, dtype=np.float32) + if not isinstance(data, np.ndarray): + data = np.asarray(data, dtype=np.float64) min_len = (emb_dim - 1) * lag + 1 if len(data) < min_len: msg = ( @@ -163,7 +164,7 @@ def lyap_r( debug_data: Literal[False] = False, plot_file: str | Path | None = None, fit_offset: int = 0, -) -> float: ... +) -> np.float64: ... @overload @@ -182,11 +183,11 @@ def lyap_r( plot_file: str | Path | None = None, fit_offset: int = 0, ) -> tuple[ - float, + np.float64, tuple[ np.ndarray[tuple[int], np.dtype[np.int32]], - np.ndarray[tuple[int], np.dtype[np.float32]], - np.ndarray[tuple[int], np.dtype[np.float32]], + np.ndarray[tuple[int], np.dtype[np.float64]], + np.ndarray[tuple[int], np.dtype[np.float64]], ], ]: ... 
@@ -206,13 +207,13 @@ def lyap_r( # noqa: C901, PLR0912, PLR0915 plot_file: str | Path | None = None, fit_offset: int = 0, ) -> ( - float + np.float64 | tuple[ float, tuple[ np.ndarray[tuple[int], np.dtype[np.int32]], - np.ndarray[tuple[int], np.dtype[np.float32]], - np.ndarray[tuple[int], np.dtype[np.float32]], + np.ndarray[tuple[int], np.dtype[np.float64]], + np.ndarray[tuple[int], np.dtype[np.float64]], ], ] ): @@ -430,7 +431,7 @@ def nb_neighbors(lag_value: int) -> int: # build divergence trajectory by averaging distances along the trajectory # over all neighbor pairs - div_traj = np.zeros(trajectory_len, dtype=np.float32) + div_traj = np.zeros(trajectory_len, dtype=np.float64) for k in range(trajectory_len): # calculate mean trajectory distance at step k indices = (np.arange(ntraj) + k, nb_idx + k) @@ -450,7 +451,7 @@ def nb_neighbors(lag_value: int) -> int: if len(ks) < 1: # if all points or all but one point in the trajectory is -inf, we cannot # fit a line through the remaining points => return -inf as exponent - poly = np.array([-np.inf, 0], dtype=np.float32) + poly = np.array([-np.inf, 0], dtype=np.float64) else: # normal line fitting poly = poly_fit(ks[fit_offset:], div_traj[fit_offset:], 1, fit=fit) From 22863baa46332ecebd55133372dfad5793778bc3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christopher=20Sch=C3=B6lzel?= Date: Thu, 19 Jun 2025 17:05:34 +0200 Subject: [PATCH 11/36] fixes linting errors in lyap_e_len --- nolds/measures.py | 53 +++++++++++++++++++---------------------------- 1 file changed, 21 insertions(+), 32 deletions(-) diff --git a/nolds/measures.py b/nolds/measures.py index 6e55659..33481aa 100644 --- a/nolds/measures.py +++ b/nolds/measures.py @@ -463,35 +463,32 @@ def nb_neighbors(lag_value: int) -> int: return le -def lyap_e_len(**kwargs): - """Helper function that calculates the minimum number of data points required - to use lyap_e. 
+def lyap_e_len(emb_dim: int, matrix_dim: int, min_tsep: int, min_nb: int) -> int: + """Returns the minimum number of data points required to use lyap_e. Note that none of the required parameters may be set to None. - Kwargs: - kwargs(dict): - arguments used for lyap_e (required: emb_dim, matrix_dim, min_nb - and min_tsep) - - Returns: - minimum number of data points required to call lyap_e with the given - parameters + Args: + matrix_dim: matrix dimension (emb_dim - 1 must be divisible by matrix_dim - 1) + min_nb: minimal number of neighbors + (default: min(2 * matrix_dim, matrix_dim + 4)) + min_tsep: minimal temporal separation between two "neighbors" + emb_dim: embedding dimension """ - m = (kwargs["emb_dim"] - 1) // (kwargs["matrix_dim"] - 1) + m = (emb_dim - 1) // (matrix_dim - 1) # minimum length required to find single orbit vector - min_len = kwargs["emb_dim"] + min_len = emb_dim # we need to follow each starting point of an orbit vector for m more steps min_len += m # we need min_tsep * 2 + 1 orbit vectors to find neighbors for each - min_len += kwargs["min_tsep"] * 2 + min_len += min_tsep * 2 # we need at least min_nb neighbors for each orbit vector - min_len += kwargs["min_nb"] + min_len += min_nb return min_len def lyap_e( - data, + data: np.typing.FloatArrayLike | np.typing.IntArrayLike, emb_dim=10, matrix_dim=4, min_nb=None, @@ -573,24 +570,16 @@ def lyap_e( (scalar) data points Kwargs: - emb_dim (int): - embedding dimension - matrix_dim (int): - matrix dimension (emb_dim - 1 must be divisible by matrix_dim - 1) - min_nb (int): - minimal number of neighbors + emb_dim: embedding dimension + matrix_dim: matrix dimension (emb_dim - 1 must be divisible by matrix_dim - 1) + min_nb: minimal number of neighbors (default: min(2 * matrix_dim, matrix_dim + 4)) - min_tsep (int): - minimal temporal separation between two "neighbors" - tau (float): - step size of the data in seconds + min_tsep: minimal temporal separation between two "neighbors" + tau: step size 
of the data in seconds (normalization scaling factor for exponents) - debug_plot (boolean): - if True, a histogram matrix of the individual estimates will be shown - debug_data (boolean): - if True, debugging data will be returned alongside the result - plot_file (str): - if debug_plot is True and plot_file is not None, the plot will be saved + debug_plot: if True, a histogram matrix of the individual estimates will be shown + debug_data: if True, debugging data will be returned alongside the result + plot_file: if debug_plot is True and plot_file is not None, the plot will be saved under the given file name instead of directly showing it through ``plt.show()`` From 5eeaeee423819634f8fa9f32aaeb94d6a0a8867b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christopher=20Sch=C3=B6lzel?= Date: Thu, 19 Jun 2025 20:54:49 +0200 Subject: [PATCH 12/36] fixes linting errors for sample entropy --- nolds/measures.py | 432 ++++++++++++++++++++++++++++------------------ pyproject.toml | 25 ++- 2 files changed, 278 insertions(+), 179 deletions(-) diff --git a/nolds/measures.py b/nolds/measures.py index 33481aa..74933d8 100644 --- a/nolds/measures.py +++ b/nolds/measures.py @@ -5,7 +5,7 @@ import math import warnings from pathlib import Path -from typing import Literal, TypeVar, cast, overload +from typing import Callable, Literal, TypeVar, cast, overload import numpy as np @@ -486,111 +486,143 @@ def lyap_e_len(emb_dim: int, matrix_dim: int, min_tsep: int, min_nb: int) -> int min_len += min_nb return min_len +@overload +def lyap_e( + data: np.typing.FloatArrayLike | np.typing.IntArrayLike, + *, + emb_dim: int = 10, + matrix_dim: int = 4, + min_nb: int | None = None, + min_tsep: int = 0, + tau: float = 1, + debug_plot: bool = False, + debug_data: Literal[False] = False, + plot_file: str | Path | None = None, +) -> np.ndarray[tuple[int], np.dtype[np.float64]]: ... 
+ +@overload def lyap_e( data: np.typing.FloatArrayLike | np.typing.IntArrayLike, - emb_dim=10, - matrix_dim=4, - min_nb=None, - min_tsep=0, - tau=1, - debug_plot=False, - debug_data=False, - plot_file=None, + *, + emb_dim: int = 10, + matrix_dim: int = 4, + min_nb: int | None = None, + min_tsep: int = 0, + tau: float = 1, + debug_plot: bool = False, + debug_data: Literal[True] = True, + plot_file: str | Path | None = None, +) -> tuple[ + np.ndarray[tuple[int], np.dtype[np.float64]], np.ndarray[tuple[int, int], np.dtype[np.float64]] +]: ... + + +def lyap_e( # noqa: C901, PLR0915 + data: np.typing.FloatArrayLike | np.typing.IntArrayLike, + *, + emb_dim: int = 10, + matrix_dim: int = 4, + min_nb: int | None = None, + min_tsep: int = 0, + tau: float = 1, + debug_plot: bool = False, + debug_data: bool = False, + plot_file: str | Path | None = None, +) -> ( + np.ndarray[tuple[int], np.dtype[np.float64]] + | tuple[ + np.ndarray[tuple[int], np.dtype[np.float64]], + np.ndarray[tuple[int, int], np.dtype[np.float64]], + ] ): - r"""Estimates the Lyapunov exponents for the given data using the algorithm of - Eckmann et al. [le_1]_. + r"""Estimates the Lyapunov exponents using the algorithm of Eckmann et al. [le_1]_. Recommendations for parameter settings by Eckmann et al.: - * long recording time improves accuracy, small tau does not - * use large values for emb_dim - * matrix_dim should be 'somewhat larger than the expected number of - positive Lyapunov exponents' - * min_nb = min(2 * matrix_dim, matrix_dim + 4) + * long recording time improves accuracy, small tau does not + * use large values for emb_dim + * matrix_dim should be 'somewhat larger than the expected number of + positive Lyapunov exponents' + * min_nb = min(2 * matrix_dim, matrix_dim + 4) Explanation of Lyapunov exponents: - The Lyapunov exponent describes the rate of separation of two - infinitesimally close trajectories of a dynamical system in phase space. 
- In a chaotic system, these trajectories diverge exponentially following - the equation: + The Lyapunov exponent describes the rate of separation of two + infinitesimally close trajectories of a dynamical system in phase space. + In a chaotic system, these trajectories diverge exponentially following + the equation: - \|X(t, X_0) - X(t, X_0 + eps)| = e^(lambda * t) * \|eps| + \|X(t, X_0) - X(t, X_0 + eps)| = e^(lambda * t) * \|eps| - In this equation X(t, X_0) is the trajectory of the system X starting at - the point X_0 in phase space at time t. eps is the (infinitesimal) - difference vector and lambda is called the Lyapunov exponent. If the - system has more than one free variable, the phase space is - multidimensional and each dimension has its own Lyapunov exponent. The - existence of at least one positive Lyapunov exponent is generally seen as - a strong indicator for chaos. + In this equation X(t, X_0) is the trajectory of the system X starting at + the point X_0 in phase space at time t. eps is the (infinitesimal) + difference vector and lambda is called the Lyapunov exponent. If the + system has more than one free variable, the phase space is + multidimensional and each dimension has its own Lyapunov exponent. The + existence of at least one positive Lyapunov exponent is generally seen as + a strong indicator for chaos. Explanation of the Algorithm: - To calculate the Lyapunov exponents analytically, the Jacobian of the - system is required. The algorithm of Eckmann et al. therefore tries to - estimate this Jacobian by reconstructing the dynamics of the system from - which the time series was obtained. For this, several steps are required: - - * Embed the time series [x_1, x_2, ..., x_(N-1)] in an orbit of emb_dim - dimensions (map each point x_i of the time series to a vector - [x_i, x_(i+1), x_(i+2), ... x_(i+emb_dim-1)]). 
- * For each vector X_i in this orbit find a radius r_i so that at least - min_nb other vectors lie within (chebyshev-)distance r_i around X_i. - These vectors will be called "neighbors" of X_i. - * Find the Matrix T_i that sends points from the neighborhood of X_i to - the neighborhood of X_(i+1). To avoid undetermined values in T_i, we - construct T_i not with size (emb_dim x emb_dim) but with size - (matrix_dim x matrix_dim), so that we have a larger "step size" m in the - X_i, which are now defined as X'_i = [x_i, x_(i+m), x_(i+2m), - ... x_(i+(matrix_dim-1)*m)]. This means that emb_dim-1 must be divisible - by matrix_dim-1. The T_i are then found by a linear least squares fit, - assuring that T_i (X_j - X_i) ~= X_(j+m) - X_(i+m) for any X_j in the - neighborhood of X_i. - * Starting with i = 1 and Q_0 = identity successively decompose the matrix - T_i * Q_(i-1) into the matrices Q_i and R_i by a QR-decomposition. - * Calculate the Lyapunov exponents from the mean of the logarithm of the - diagonal elements of the matrices R_i. To normalize the Lyapunov - exponents, they have to be divided by m and by the step size tau of the - original time series. + To calculate the Lyapunov exponents analytically, the Jacobian of the + system is required. The algorithm of Eckmann et al. therefore tries to + estimate this Jacobian by reconstructing the dynamics of the system from + which the time series was obtained. For this, several steps are required: + + * Embed the time series [x_1, x_2, ..., x_(N-1)] in an orbit of emb_dim + dimensions (map each point x_i of the time series to a vector + [x_i, x_(i+1), x_(i+2), ... x_(i+emb_dim-1)]). + * For each vector X_i in this orbit find a radius r_i so that at least + min_nb other vectors lie within (chebyshev-)distance r_i around X_i. + These vectors will be called "neighbors" of X_i. + * Find the Matrix T_i that sends points from the neighborhood of X_i to + the neighborhood of X_(i+1). 
To avoid undetermined values in T_i, we + construct T_i not with size (emb_dim x emb_dim) but with size + (matrix_dim x matrix_dim), so that we have a larger "step size" m in the + X_i, which are now defined as X'_i = [x_i, x_(i+m), x_(i+2m), + ... x_(i+(matrix_dim-1)*m)]. This means that emb_dim-1 must be divisible + by matrix_dim-1. The T_i are then found by a linear least squares fit, + assuring that T_i (X_j - X_i) ~= X_(j+m) - X_(i+m) for any X_j in the + neighborhood of X_i. + * Starting with i = 1 and Q_0 = identity successively decompose the matrix + T_i * Q_(i-1) into the matrices Q_i and R_i by a QR-decomposition. + * Calculate the Lyapunov exponents from the mean of the logarithm of the + diagonal elements of the matrices R_i. To normalize the Lyapunov + exponents, they have to be divided by m and by the step size tau of the + original time series. References: - .. [le_1] J. P. Eckmann, S. O. Kamphorst, D. Ruelle, and S. Ciliberto, - “Liapunov exponents from time series,” Physical Review A, - vol. 34, no. 6, pp. 4971–4979, 1986. + .. [le_1] J. P. Eckmann, S. O. Kamphorst, D. Ruelle, and S. Ciliberto, + “Liapunov exponents from time series,” Physical Review A, + vol. 34, no. 6, pp. 4971–4979, 1986. Reference code: - .. [le_a] Manfred Füllsack, "Lyapunov exponent", - url: http://systems-sciences.uni-graz.at/etextbook/sw2/lyapunov.html - .. [le_b] Steve SIU, Lyapunov Exponents Toolbox (LET), - url: http://www.mathworks.com/matlabcentral/fileexchange/233-let/content/LET/findlyap.m - .. [le_c] Rainer Hegger, Holger Kantz, and Thomas Schreiber, TISEAN, - url: http://www.mpipks-dresden.mpg.de/~tisean/Tisean_3.0.1/index.html + .. [le_a] Manfred Füllsack, "Lyapunov exponent", + url: http://systems-sciences.uni-graz.at/etextbook/sw2/lyapunov.html + .. [le_b] Steve SIU, Lyapunov Exponents Toolbox (LET), + url: http://www.mathworks.com/matlabcentral/fileexchange/233-let/content/LET/findlyap.m + .. 
[le_c] Rainer Hegger, Holger Kantz, and Thomas Schreiber, TISEAN, + url: http://www.mpipks-dresden.mpg.de/~tisean/Tisean_3.0.1/index.html Args: - data (array-like of float): - (scalar) data points - - Kwargs: - emb_dim: embedding dimension - matrix_dim: matrix dimension (emb_dim - 1 must be divisible by matrix_dim - 1) - min_nb: minimal number of neighbors - (default: min(2 * matrix_dim, matrix_dim + 4)) - min_tsep: minimal temporal separation between two "neighbors" - tau: step size of the data in seconds - (normalization scaling factor for exponents) - debug_plot: if True, a histogram matrix of the individual estimates will be shown - debug_data: if True, debugging data will be returned alongside the result - plot_file: if debug_plot is True and plot_file is not None, the plot will be saved - under the given file name instead of directly showing it through - ``plt.show()`` + data: (scalar) data points + emb_dim: embedding dimension + matrix_dim: matrix dimension (emb_dim - 1 must be divisible by matrix_dim - 1) + min_nb: minimal number of neighbors + (default: min(2 * matrix_dim, matrix_dim + 4)) + min_tsep: minimal temporal separation between two "neighbors" + tau: step size of the data in seconds + (normalization scaling factor for exponents) + debug_plot: if True, a histogram matrix of the individual estimates will be shown + debug_data: if True, debugging data will be returned alongside the result + plot_file: if debug_plot is True and plot_file is not None, the plot will be saved + under the given file name instead of directly showing it through + ``plt.show()`` Returns: - float array: - array of matrix_dim Lyapunov exponents (positive exponents are indicators - for chaos) - 2d-array of floats: - only present if debug_data is True: all estimates for the matrix_dim - Lyapunov exponents from the x iterations of R_i. The shape of this debug - data is (x, matrix_dim). + Array of matrix_dim Lyapunov exponents (positive exponents are indicators + for chaos). 
If `debug_data = True`, the return type is a tuple instead + with the first element being the Lyapunov exponents and the second element + being all estimates for the matrix_dim Lyapunov exponents from the x + iterations of R_i. The shape of this debug data is (x, matrix_dim). """ # convert to float to avoid errors when using 'inf' as distance data = np.asarray(data, dtype=np.float64) @@ -641,8 +673,6 @@ def lyap_e( lexp = np.zeros(matrix_dim, dtype=np.float64) lexp_counts = np.zeros(lexp.shape) debug_values = [] - # TODO reduce number of points to visit? - # TODO performance test! for i in range(len(orbit)): # find neighbors for each vector in the orbit using the chebyshev distance diffs = rowwise_chebyshev(orbit, orbit[i]) @@ -666,7 +696,7 @@ def lyap_e( # there may be more than min_nb vectors at distance r (if multiple vectors # have a distance of exactly r) # => update index accordingly - indices = np.where(diffs <= r)[0] + indices = (diffs <= r).nonzero()[0] # find the matrix T_i that satisifies # T_i (orbit'[j] - orbit'[i]) = (orbit'[j+m] - orbit'[i+m]) @@ -698,7 +728,7 @@ def lyap_e( # x_j2 - x_i x_j2+m - x_i+m ... x_j2+(d_M-1)m - x_i+(d_M-1)m # ... - # note: emb_dim = (d_M - 1) * m + 1 + # note: emb_dim = (d_M - 1) * m + 1 # noqa: ERA001 mat_X = np.array([data[j : j + emb_dim : m] for j in indices]) mat_X -= data[i : i + emb_dim : m] @@ -706,7 +736,7 @@ def lyap_e( # x_j1+(d_M)m - x_i+(d_M)m # x_j2+(d_M)m - x_i+(d_M)m # ... - if max(np.max(indices), i) + matrix_dim * m >= len(data): + if max(int(np.max(indices)), i) + matrix_dim * m >= len(data): assert len(data) < min_len msg = ( "Not enough data points. 
Cannot follow orbit vector {} for " @@ -768,7 +798,23 @@ def lyap_e( return lexp -def plot_dists(dists, tolerance, m, title=None, fname=None) -> None: +def plot_dists( + dists: list[np.ndarray[tuple[int], np.dtype[np.float64]]], + tolerance: float, + m: int, + title: str | None = None, + fname: str | Path | None = None, +) -> None: + """Plots a histogram per distance array in dists. + + Args: + dists: Distance arrays for which to plot the histograms. + tolerance: Tolerance value for the distance (will be highlighted). + m: Embedding dimension (used for labeling the histograms). + title: Title for the plot (optional). + fname: If not None, the plot will be saved under this file name instead of + showing it directly with ``plt.show()``. + """ # local import to avoid dependency for non-debug use import matplotlib.pyplot as plt @@ -778,13 +824,11 @@ def plot_dists(dists, tolerance, m, title=None, fname=None) -> None: ymax = len(dists_full) * 0.05 mean = np.mean(dists_full) std = np.std(dists_full, ddof=1) - rng = (0, mean + std * nstd) - i = 0 + rng = (0.0, float(mean + std * nstd)) colors = ["green", "blue"] - for h, bins in [np.histogram(dat, nbins, rng) for dat in dists]: + for i, (h, bins) in enumerate([np.histogram(dat, bins=nbins, range=rng) for dat in dists]): bw = bins[1] - bins[0] plt.bar(bins[:-1], h, bw, label=f"m={m + i:d}", color=colors[i], alpha=0.5) - i += 1 plt.axvline(tolerance, color="red") plt.legend(loc="best") plt.xlabel("distance") @@ -799,87 +843,143 @@ def plot_dists(dists, tolerance, m, title=None, fname=None) -> None: plt.close() +@overload def sampen( - data, - emb_dim=2, - tolerance=None, - lag=1, - dist=rowwise_chebyshev, - closed=False, - debug_plot=False, - debug_data=False, - plot_file=None, + data: np.typing.FloatArrayLike | np.typing.IntArrayLike, + *, + emb_dim: int = 2, + tolerance: float | None = None, + lag: int = 1, + dist: Callable[ + [ + np.ndarray[tuple[int, int], np.dtype[np.float64]], + np.ndarray[tuple[int], 
np.dtype[np.float64]], + ], + np.ndarray[tuple[int], np.dtype[np.float64]], + ] = rowwise_chebyshev, + closed: bool = False, + debug_plot: bool = False, + debug_data: Literal[False] = False, + plot_file: str | Path | None = None, +) -> float: ... + + +@overload +def sampen( + data: np.typing.FloatArrayLike | np.typing.IntArrayLike, + *, + emb_dim: int = 2, + tolerance: float | None = None, + lag: int = 1, + dist: Callable[ + [ + np.ndarray[tuple[int, int], np.dtype[np.float64]], + np.ndarray[tuple[int], np.dtype[np.float64]], + ], + np.ndarray[tuple[int], np.dtype[np.float64]], + ] = rowwise_chebyshev, + closed: bool = False, + debug_plot: bool = False, + debug_data: Literal[True], + plot_file: str | Path | None = None, +) -> tuple[ + float, + list[float], + list[np.ndarray[tuple[int], np.dtype[np.float64]]], +]: ... + + +def sampen( # noqa: C901, PLR0912 + data: np.typing.FloatArrayLike | np.typing.IntArrayLike, + *, + emb_dim: int = 2, + tolerance: float | None = None, + lag: int = 1, + dist: Callable[ + [ + np.ndarray[tuple[int, int], np.dtype[np.float64]], + np.ndarray[tuple[int], np.dtype[np.float64]], + ], + np.ndarray[tuple[int], np.dtype[np.float64]], + ] = rowwise_chebyshev, + closed: bool = False, + debug_plot: bool = False, + debug_data: bool = False, + plot_file: str | Path | None = None, +) -> ( + float + | tuple[ + float, + list[float], + list[np.ndarray[tuple[int], np.dtype[np.float64]]], + ] ): """Computes the sample entropy of the given data. Explanation of the sample entropy: - The sample entropy of a time series is defined as the negative natural - logarithm of the conditional probability that two sequences similar for - emb_dim points remain similar at the next point, excluding self-matches. + The sample entropy of a time series is defined as the negative natural + logarithm of the conditional probability that two sequences similar for + emb_dim points remain similar at the next point, excluding self-matches. 
- A lower value for the sample entropy therefore corresponds to a higher - probability indicating more self-similarity. + A lower value for the sample entropy therefore corresponds to a higher + probability indicating more self-similarity. Explanation of the algorithm: - The algorithm constructs all subsequences of length emb_dim - [s_1, s_1+lag, s_1+2*lag, ...] and then counts each pair (s_i, s_j) with i != j - where dist(s_i, s_j) < tolerance. The same process is repeated for all - subsequences of length emb_dim + 1. The sum of similar sequence pairs - with length emb_dim + 1 is divided by the sum of similar sequence pairs - with length emb_dim. The result of the algorithm is the negative logarithm - of this ratio/probability. + The algorithm constructs all subsequences of length emb_dim + [s_1, s_1+lag, s_1+2*lag, ...] and then counts each pair (s_i, s_j) with i != j + where dist(s_i, s_j) < tolerance. The same process is repeated for all + subsequences of length emb_dim + 1. The sum of similar sequence pairs + with length emb_dim + 1 is divided by the sum of similar sequence pairs + with length emb_dim. The result of the algorithm is the negative logarithm + of this ratio/probability. References: - .. [se_1] J. S. Richman and J. R. Moorman, “Physiological time-series - analysis using approximate entropy and sample entropy,” - American Journal of Physiology-Heart and Circulatory Physiology, - vol. 278, no. 6, pp. H2039–H2049, 2000. + .. [se_1] J. S. Richman and J. R. Moorman, “Physiological time-series + analysis using approximate entropy and sample entropy,” + American Journal of Physiology-Heart and Circulatory Physiology, + vol. 278, no. 6, pp. H2039–H2049, 2000. Reference code: - .. [se_a] "sample_entropy" function in R-package "pracma", - url: https://cran.r-project.org/web/packages/pracma/pracma.pdf + .. 
[se_a] "sample_entropy" function in R-package "pracma", + url: https://cran.r-project.org/web/packages/pracma/pracma.pdf Args: - data (array-like of float): - input data - - Kwargs: - emb_dim (int): - the embedding dimension (length of vectors to compare) - tolerance (float): - distance threshold for two template vectors to be considered equal - (default: 0.2 * std(data) at emb_dim = 2, corrected for dimension effect - for other values of emb_dim) - lag (int): - delay for the delay embedding - dist (function (2d-array, 1d-array) -> 1d-array): - distance function used to calculate the distance between template - vectors. Sampen is defined using ``rowwise_chebyshev``. You should only - use something else, if you are sure that you need it. - closed (boolean): - if True, will check for vector pairs whose distance is in the closed - interval [0, r] (less or equal to r), otherwise the open interval - [0, r) (less than r) will be used - debug_plot (boolean): - if True, a histogram of the individual distances for m and m+1 - debug_data (boolean): - if True, debugging data will be returned alongside the result - plot_file (str): - if debug_plot is True and plot_file is not None, the plot will be saved - under the given file name instead of directly showing it through - ``plt.show()`` + data: input data + emb_dim (int): + the embedding dimension (length of vectors to compare) + tolerance (float): + distance threshold for two template vectors to be considered equal + (default: 0.2 * std(data) at emb_dim = 2, corrected for dimension effect + for other values of emb_dim) + lag (int): + delay for the delay embedding + dist (function (2d-array, 1d-array) -> 1d-array): + distance function used to calculate the distance between template + vectors. Sampen is defined using ``rowwise_chebyshev``. You should only + use something else, if you are sure that you need it. 
+ closed (boolean): + if True, will check for vector pairs whose distance is in the closed + interval [0, r] (less or equal to r), otherwise the open interval + [0, r) (less than r) will be used + debug_plot (boolean): + if True, a histogram of the individual distances for m and m+1 + debug_data (boolean): + if True, debugging data will be returned alongside the result + plot_file (str): + if debug_plot is True and plot_file is not None, the plot will be saved + under the given file name instead of directly showing it through + ``plt.show()`` Returns: - float: - the sample entropy of the data (negative logarithm of ratio between - similar template vectors of length emb_dim + 1 and emb_dim) - [c_m, c_m1]: - list of two floats: count of similar template vectors of length emb_dim - (c_m) and of length emb_dim + 1 (c_m1) - [float list, float list]: - Lists of lists of the form ``[dists_m, dists_m1]`` containing the - distances between template vectors for m (dists_m) - and for m + 1 (dists_m1). + The sample entropy of the data (negative logarithm of ratio between + similar template vectors of length emb_dim + 1 and emb_dim). If + `debug_data` is True, the return type is a tuple instead, containing + + - sampen: the sample entropy + - [c_m, c_m1]: list of two floats: count of similar template vectors of + length emb_dim (c_m) and of length emb_dim + 1 (c_m1) + - [dists_m, dists_m1]: the distances between template vectors for m + (dists_m) and for m + 1 (dists_m1). """ data = np.asarray(data) @@ -891,7 +991,7 @@ def sampen( # the chebyshev distance of vectors sampled from a univariate normal # distribution # 4. 
0.1164 is used as a factor to ensure that tolerance == std * 0.2 for - # emb_dim == 2 + # emb_dim == 2 # noqa: ERA001 tolerance = np.std(data, ddof=1) * 0.1164 * (0.5627 * np.log(emb_dim) + 1.3334) n = len(data) @@ -925,9 +1025,9 @@ def sampen( plot_data[-1].extend(dsts) # count how many distances are smaller than the tolerance if closed: - counts[-1] += np.sum(dsts <= tolerance) + counts[-1] += np.sum(dsts <= cast("float", tolerance)) else: - counts[-1] += np.sum(dsts < tolerance) + counts[-1] += np.sum(dsts < cast("float", tolerance)) if counts[0] > 0 and counts[1] > 0: saen = -np.log(1.0 * counts[1] / counts[0]) else: @@ -952,7 +1052,9 @@ def sampen( else: saen = np.inf if debug_plot: - plot_dists(plot_data, tolerance, m, title=f"sampEn = {saen:.3f}", fname=plot_file) + plot_dists( + plot_data, cast("float", tolerance), m, title=f"sampEn = {saen:.3f}", fname=plot_file + ) if debug_data: return (saen, counts, plot_data) return saen diff --git a/pyproject.toml b/pyproject.toml index 7941338..2bbdd6c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ name = "nolds" version = "0.6.2" description = "Nonlinear measures for dynamical systems (based on one-dimensional time series)" authors = [ - {name= "Christopher Schölzel", email= "christopher.schoelzel@mailbox.org"} + { name = "Christopher Schölzel", email = "christopher.schoelzel@mailbox.org" }, ] license = "MIT" license-files = ["LICENSE.txt"] @@ -18,7 +18,7 @@ keywords = [ "DFA", "detrended fluctuation analysis", "sample entropy", - "correlation dimension" + "correlation dimension", ] classifiers = [ "Development Status :: 5 - Production/Stable", @@ -31,15 +31,11 @@ classifiers = [ "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12" + "Programming Language :: Python :: 3.12", ] readme = "README.rst" requires-python = ">=3.8" -dependencies = [ - "numpy>1.0,<3.0", - "future>=1.0", - 
"setuptools>=72.1.0" -] +dependencies = ["numpy>1.0,<3.0", "future>=1.0", "setuptools>=72.1.0"] [project.optional-dependencies] RANSAC = ["scikit-learn>=0.19"] @@ -64,7 +60,7 @@ build-backend = "hatchling.build" [tool.ruff] -line-length = 100 # allow slightly longer lines +line-length = 100 # allow slightly longer lines indent-width = 4 # Assume Python 3.8 @@ -81,14 +77,15 @@ indent-style = "space" # Select all rules by default select = ["ALL"] ignore = [ - "TD002", # adding an author to TODOs wastes space and is redundant because of git history - "TD003", # once there is an issue, there is no need to keep the TODO => don't require links - "COM812", # missing trailing commas are already fixed by the formatter + "TD002", # adding an author to TODOs wastes space and is redundant because of git history + "TD003", # once there is an issue, there is no need to keep the TODO => don't require links + "COM812", # missing trailing commas are already fixed by the formatter "PLR0913", # this is a scientific library, functions just have a bazillion parameters - "S101", # nolds uses assers as legitimate checks for programming (not user) errors + "S101", # nolds uses assers as legitimate checks for programming (not user) errors + "N806", # some variables are kept close to the papers, which means they will have uppercase letters in them ] allowed-confusables = [ - "–" # en-dashes are used for bibliographical references in docstrings + "–", # en-dashes are used for bibliographical references in docstrings ] From 279d2656ddb51936e9d5e66fa0850ba286aa011a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christopher=20Sch=C3=B6lzel?= Date: Sun, 22 Jun 2025 14:01:09 +0200 Subject: [PATCH 13/36] fixes type errors in a few more helper functions --- nolds/measures.py | 198 +++++++++++++++++++++++----------------------- pyproject.toml | 1 + 2 files changed, 102 insertions(+), 97 deletions(-) diff --git a/nolds/measures.py b/nolds/measures.py index 74933d8..047d8bb 100644 --- a/nolds/measures.py 
+++ b/nolds/measures.py @@ -486,6 +486,7 @@ def lyap_e_len(emb_dim: int, matrix_dim: int, min_tsep: int, min_nb: int) -> int min_len += min_nb return min_len + @overload def lyap_e( data: np.typing.FloatArrayLike | np.typing.IntArrayLike, @@ -1041,7 +1042,7 @@ def sampen( # noqa: C901, PLR0912 ( "Zero vectors are within tolerance for {}. " "Consider raising the tolerance parameter to avoid {} result." - ).format(" and ".join(zcounts), "NaN" if len(zcounts) == 2 else "inf"), + ).format(" and ".join(zcounts), "NaN" if len(zcounts) == 2 else "inf"), # noqa: PLR2004 RuntimeWarning, stacklevel=2, ) @@ -1060,49 +1061,48 @@ def sampen( # noqa: C901, PLR0912 return saen -def binary_n(total_N, min_n=50): - """Creates a list of values by successively halving the total length total_N - until the resulting value is less than min_n. +def binary_n(total_N: int, min_n: int = 50) -> list[int]: + """Creates a list of values by successively halving the total length total_N. - Non-integer results are rounded down. + The iteration stops when the resulting value is less than min_n. Non-integer + results are rounded down. Args: - total_N (int): - total length - Kwargs: - min_n (int): - minimal length after division + total_N: total length + min_n: minimal length after division Returns: - list of integers: - total_N/2, total_N/4, total_N/8, ... until total_N/2^i < min_n + [total_N/2, total_N/4, total_N/8, ...] until total_N/2^i < min_n """ max_exp = np.log2(1.0 * total_N / min_n) max_exp = int(np.floor(max_exp)) return [int(np.floor(1.0 * total_N / (2**i))) for i in range(1, max_exp + 1)] -def logarithmic_n(min_n, max_n, factor): - """Creates a list of values by successively multiplying a minimum value min_n by - a factor > 1 until a maximum value max_n is reached. +def logarithmic_n(min_n: int, max_n: int, factor: float) -> list[int]: + """Creates a list of window sizes that are equidistant on a logarithmic scale. 
+ + The values are calculated by multiplying a minimum value min_n by a factor > 1 + until a maximum value max_n is reached. Non-integer results are rounded down. Args: - min_n (float): - minimum value (must be < max_n) - max_n (float): - maximum value (must be > min_n) - factor (float): - factor used to increase min_n (must be > 1) + min_n: minimum value (must be < max_n) + max_n: maximum value (must be > min_n) + factor: factor used to increase min_n (must be > 1) Returns: - list of integers: - min_n, min_n * factor, min_n * factor^2, ... min_n * factor^i < max_n - without duplicates + [min_n, min_n * factor, min_n * factor^2, ... min_n * factor^i] where + all values are < max_n. Duplicates (due to step sizes less than 1) are + discarded. """ - assert max_n > min_n - assert factor > 1 + if max_n <= min_n: + msg = f"max_n must be larger than min_n ({max_n} <= {min_n})." + raise ValueError(msg) + if factor <= 1: + msg = f"Factor must be larger than 1, but got {factor}." + raise ValueError(msg) # stop condition: min * f^x = max # => f^x = max/min # => x = log(max/min) / log(f) @@ -1115,134 +1115,121 @@ def logarithmic_n(min_n, max_n, factor): return ns -def logmid_n(max_n, ratio=1 / 4.0, nsteps=15): - """Creates an array of integers that lie evenly spaced in the "middle" of the - logarithmic scale from 0 to log(max_n). +def logmid_n( + max_n: int, ratio: float = 1 / 4.0, nsteps: int = 15 +) -> np.ndarray[tuple[int], np.dtype[np.int32]]: + """Creates an array of equidistant values in the "middle" of [0, max_n] on a logarithmic scale. If max_n is very small and/or nsteps is very large, this may lead to duplicate values which will be removed from the output. This function has benefits in hurst_rs, because it cuts away both very small and very large n, which both can cause problems, and still produces a - logarithmically spaced sequence. + sequence that is equidistant on the logarithmic scale. 
Args: - max_n (int): - largest possible output value (should be the sequence length when used in - hurst_rs) - - Kwargs: - ratio (float): - width of the "middle" of the logarithmic interval relative to log(max_n). - For example, for ratio=1/2.0 the logarithm of the resulting values will - lie between 0.25 * log(max_n) and 0.75 * log(max_n). - nsteps (float): - (maximum) number of values to take from the specified range + max_n: largest possible output value (should be the sequence length when + used in hurst_rs) + ratio: width of the "middle" of the logarithmic interval relative to log(max_n). + For example, for ratio=1/2.0 the logarithm of the resulting values will + lie between 0.25 * log(max_n) and 0.75 * log(max_n). + nsteps: (maximum) number of values to take from the specified range Returns: - array of int: - a logarithmically spaced sequence of at most nsteps values (may be less, - because only unique values are returned) + A logarithmically spaced sequence of at most nsteps values (may be less + because only unique values are returned). """ - l = np.log(max_n) - span = l * ratio - start = l * (1 - ratio) * 0.5 + logmax = np.log(max_n) + span = logmax * ratio + start = logmax * (1 - ratio) * 0.5 midrange = start + 1.0 * np.arange(nsteps) / nsteps * span - nvals = np.round(np.exp(midrange)).astype("int32") + nvals = np.round(np.exp(midrange)).astype(np.int32) return np.unique(nvals) -def logarithmic_r(min_n, max_n, factor): - """Creates a list of values by successively multiplying a minimum value min_n by +def logarithmic_r(min_n: int, max_n: int, factor: float) -> list[float]: + """Creates a list of real values that are equidistant on a logarithmic scale. + + The values are generated by successively multiplying a minimum value min_n by a factor > 1 until a maximum value max_n is reached. 
Args: - min_n (float): - minimum value (must be < max_n) - max_n (float): - maximum value (must be > min_n) - factor (float): - factor used to increase min_n (must be > 1) + min_n: minimum value (must be < max_n) + max_n: maximum value (must be > min_n) + factor: factor used to increase min_n (must be > 1) Returns: - list of floats: - min_n, min_n * factor, min_n * factor^2, ... min_n * factor^i < max_n + [min_n, min_n * factor, min_n * factor^2, ... min_n * factor^i] where + all values are < max_n. """ - assert max_n > min_n - assert factor > 1 + if max_n <= min_n: + msg = f"max_n must be larger than min_n ({max_n} <= {min_n})." + raise ValueError(msg) + if factor <= 1: + msg = f"Factor must be larger than 1, but got {factor}." + raise ValueError(msg) max_i = int(np.floor(np.log(1.0 * max_n / min_n) / np.log(factor))) return [min_n * (factor**i) for i in range(max_i + 1)] -def expected_rs(n): - """Calculates the expected (R/S)_n for white noise for a given n. +def expected_rs(n: int) -> float: + """Approximates the expected (R/S)_n for white noise for a given n. This is used as a correction factor in the function hurst_rs. It uses the formula of Anis-Lloyd-Peters (see [h_3]_). Args: - n (int): - the value of n for which the expected (R/S)_n should be calculated + n: the value of n for which the expected (R/S)_n should be calculated Returns: - float: expected (R/S)_n for white noise """ front = (n - 0.5) / n i = np.arange(1, n) back = np.sum(np.sqrt((n - i) / i)) - if n <= 340: + small = 340 # small values behave differently + if n <= small: middle = math.gamma((n - 1) * 0.5) / math.sqrt(math.pi) / math.gamma(n * 0.5) else: middle = 1.0 / math.sqrt(n * math.pi * 0.5) return front * middle * back -def expected_h(nvals, fit="RANSAC"): - """Uses expected_rs to calculate the expected value for the Hurst exponent h - based on the values of n used for the calculation. 
+def expected_h(nvals: np.typing.IntArrayLike, fit: FittingMethod = "RANSAC") -> float: + """Uses expected_rs to calculate the expected value for the Hurst exponent h. Args: - nvals (iterable of int): - the values of n used to calculate the individual (R/S)_n - - KWargs: - fit (str): - the fitting method to use for the line fit, either 'poly' for normal - least squares polynomial fitting or 'RANSAC' for RANSAC-fitting which - is more robust to outliers + nvals: The values of n used to calculate the individual (R/S)_n + fit: the fitting method to use for the line fit, either 'poly' for normal + least squares polynomial fitting or 'RANSAC' for RANSAC-fitting which + is more robust to outliers Returns: - float: expected h for white noise """ + nvals = np.asarray(nvals, dtype=np.int32) rsvals = [expected_rs(n) for n in nvals] poly = poly_fit(np.log(nvals), np.log(rsvals), 1, fit=fit) return poly[0] -def rs(data, n, unbiased=True): - """Calculates an individual R/S value in the rescaled range approach for - a given n. +def rs( + data: np.ndarray[tuple[int], np.dtype[np.float64]], n: int, *, unbiased: bool = True +) -> float: + """Calculates an individual R/S value in the rescaled range approach for a given n. Note: This is just a helper function for hurst_rs and should not be called directly. Args: - data (array-like of float): - time series - n (float): - size of the subseries in which data should be split - - Kwargs: - unbiased (boolean): - if True, the standard deviation based on the unbiased variance - (1/(N-1) instead of 1/N) will be used. This should be the default choice, - since the true mean of the sequences is not known. This parameter should - only be changed to recreate results of other implementations. + data: time series + n: size of the subseries in which data should be split + unbiased: if True, the standard deviation based on the unbiased variance + (1/(N-1) instead of 1/N) will be used. 
This should be the default choice, + since the true mean of the sequences is not known. This parameter should + only be changed to recreate results of other implementations. Returns: - float: (R/S)_n """ data = np.asarray(data) @@ -1274,7 +1261,24 @@ def rs(data, n, unbiased=True): return np.mean(r / s) -def plot_histogram_matrix(data, name, bin_range="3sigma", fname=None) -> None: +def plot_histogram_matrix( + data: np.ndarray[tuple[int, int], np.dtype[np.float64]], + name: str, + bin_range: Literal["absmax", "1sigma", "2sigma", "3sigma", "4sigma", "5sigma"] = "3sigma", + fname: str | Path | None = None, +) -> None: + """Plot a quadratic matrix of histograms. + + Args: + data: matrix of shape (N, K) where K is the number of histograms and N is the size of + a single dimension of which to take a histogram. + name: Title of the plots. + bin_range: How to determine the range of the histogram. "absmax" uses the absolute + maximum and minimum, while Xsigma cuts off values outside the X sigma range + assuming a normal distributed dataset. + fname: File name to use to store the plot. If this is not given, the plot is displayed + with show() instead. 
+ """ # local import to avoid dependency for non-debug use import matplotlib.pyplot as plt @@ -1285,19 +1289,19 @@ def plot_histogram_matrix(data, name, bin_range="3sigma", fname=None) -> None: plt.figure(figsize=(nrows * 4, nrows * 4)) for i in range(nhists): plt.subplot(nrows, nrows, i + 1) - absmax = max(abs(np.max(data[:, i])), abs(np.min(data[:, i]))) + absmax = max(float(abs(np.max(data[:, i]))), float(abs(np.min(data[:, i])))) if bin_range == "absmax": rng = (-absmax, absmax) elif bin_range.endswith("sigma"): n = int(bin_range[: -len("sigma")]) mu = np.mean(data[:, i]) sigma = np.std(data[:, i], ddof=1) - rng = (mu - n * sigma, mu + n * sigma) + rng = (float(mu - n * sigma), float(mu + n * sigma)) h, bins = np.histogram(data[:, i], nbins, rng) bin_width = bins[1] - bins[0] h = h.astype(np.float64) / np.sum(h) plt.bar(bins[:-1], h, bin_width) - plt.axvline(np.mean(data[:, i]), color="red") + plt.axvline(float(np.mean(data[:, i])), color="red") plt.ylim(ylim) plt.title(f"{name:s}[{i:d}]") if fname is None: diff --git a/pyproject.toml b/pyproject.toml index 2bbdd6c..9463d8f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -83,6 +83,7 @@ ignore = [ "PLR0913", # this is a scientific library, functions just have a bazillion parameters "S101", # nolds uses assers as legitimate checks for programming (not user) errors "N806", # some variables are kept close to the papers, which means they will have uppercase letters in them + "N803", # same as for variables: We use conventions from scientific papers, which inlcude uppercase letters ] allowed-confusables = [ "–", # en-dashes are used for bibliographical references in docstrings From f3ee7568503c2e75e3d270afb2ec61ab2b8643f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christopher=20Sch=C3=B6lzel?= Date: Sat, 28 Jun 2025 22:02:04 +0200 Subject: [PATCH 14/36] fixes more linting errors --- nolds/measures.py | 1085 +++++++++++++++++++++++++++------------------ 1 file changed, 644 insertions(+), 441 deletions(-) diff --git 
a/nolds/measures.py b/nolds/measures.py index 047d8bb..bb76a18 100644 --- a/nolds/measures.py +++ b/nolds/measures.py @@ -1147,29 +1147,29 @@ def logmid_n( return np.unique(nvals) -def logarithmic_r(min_n: int, max_n: int, factor: float) -> list[float]: +def logarithmic_r(min_r: float, max_r: float, factor: float) -> list[float]: """Creates a list of real values that are equidistant on a logarithmic scale. The values are generated by successively multiplying a minimum value min_n by a factor > 1 until a maximum value max_n is reached. Args: - min_n: minimum value (must be < max_n) - max_n: maximum value (must be > min_n) - factor: factor used to increase min_n (must be > 1) + min_r: minimum value (must be < max_r) + max_r: maximum value (must be > min_r) + factor: factor used to increase min_r (must be > 1) Returns: - [min_n, min_n * factor, min_n * factor^2, ... min_n * factor^i] where - all values are < max_n. + [min_r, min_r * factor, min_r * factor^2, ... min_r * factor^i] where + all values are < max_r. """ - if max_n <= min_n: - msg = f"max_n must be larger than min_n ({max_n} <= {min_n})." + if max_r <= min_r: + msg = f"max_r must be larger than min_r ({max_r} <= {min_r})." raise ValueError(msg) if factor <= 1: msg = f"Factor must be larger than 1, but got {factor}." 
raise ValueError(msg) - max_i = int(np.floor(np.log(1.0 * max_n / min_n) / np.log(factor))) - return [min_n * (factor**i) for i in range(max_i + 1)] + max_i = int(np.floor(np.log(1.0 * max_r / min_r) / np.log(factor))) + return [min_r * (factor**i) for i in range(max_i + 1)] def expected_rs(n: int) -> float: @@ -1312,38 +1312,31 @@ def plot_histogram_matrix( def plot_reg( - xvals, - yvals, - poly, - x_label="x", - y_label="y", - data_label="data", - reg_label="regression line", - fname=None, + xvals: np.ndarray[tuple[int], np.dtype[np.float64]], + yvals: np.ndarray[tuple[int], np.dtype[np.float64]], + poly: np.ndarray[tuple[int], np.dtype[np.float64]] | None = None, + x_label: str = "x", + y_label: str = "y", + data_label: str = "data", + reg_label: str = "regression line", + fname: str | Path | None = None, ) -> None: - """Helper function to plot trend lines for line-fitting approaches. This - function will show a plot through ``plt.show()`` and close it after the - window has been closed by the user. + """Plots trend lines for line-fitting approaches. + + This function will show a plot through ``plt.show()`` and close it after the + window has been closed by the user unless `fname` is provided, in which case + the plot will be saved to disc under the given file name instead. 
Args: - xvals (list/array of float): - list of x-values - yvals (list/array of float): - list of y-values - poly (list/array of float): - polynomial parameters as accepted by ``np.polyval`` - Kwargs: - x_label (str): - label of the x-axis - y_label (str): - label of the y-axis - data_label (str): - label of the data - reg_label(str): - label of the regression line - fname (str): - file name (if not None, the plot will be saved to disc instead of - showing it though ``plt.show()``) + xvals: list of x-values + yvals: list of y-values + poly: polynomial parameters as accepted by ``np.polyval`` + x_label: label of the x-axis + y_label: label of the y-axis + data_label: label of the data + reg_label: label of the regression line + fname: file name (if not None, the plot will be saved to disc instead of + showing it though ``plt.show()``) """ # local import to avoid dependency for non-debug use import matplotlib.pyplot as plt @@ -1361,22 +1354,42 @@ def plot_reg( plt.close() +# TODO this is not used anywhere. Do we still need it? def plot_reg_tiled( - xvals, - yvals, - polys, - x_label="x", - y_label="y", - data_labels=None, - reg_labels=None, - fname=None, - columns=None, + xvals: np.ndarray[tuple[int, int], np.dtype[np.float64]], + yvals: np.ndarray[tuple[int, int], np.dtype[np.float64]], + polys: list[np.ndarray[tuple[int], np.dtype[np.float64]]] | None = None, + x_label: str = "x", + y_label: str = "y", + data_labels: list[str] | None = None, + reg_labels: list[str] | None = None, + fname: str | Path | None = None, + columns: int | None = None, ) -> None: - """TODO.""" + """Plots trend lines for multiple line-fitting approaches in a tiled layout. + + This function will show a plot through ``plt.show()`` and close it after the + window has been closed by the user unless `fname` is provided, in which case + the plot will be saved to disc under the given file name instead. 
+ + Args: + xvals: values on the x-axis in shape (#plots, #datapoints) + yvals: values on the y-axis in shape (#plots, #datapoints) + polys: polynomial parameters as accepted by ``np.polyval`` in shape + (#plots, #params) + x_label: x axis label + y_label: y axis label + data_labels: labels of the povided datasets + reg_labels: labels of the regression lines + fname: file name (if not None, the plot will be saved to disc instead of + showing it though ``plt.show()``) + columns: number of columns for the tiled view, defaults to minimum + number required to obtain a square grid. + """ # local import to avoid dependency for non-debug use import matplotlib.pyplot as plt - max_span = max([np.max(y) - np.min(y) for y in yvals]) + max_span = max([float(np.max(y)) - float(np.min(y)) for y in yvals]) means = [np.mean(y) for y in yvals] if columns is None: columns = min(4, int(np.ceil(np.sqrt(len(xvals))))) @@ -1401,9 +1414,33 @@ def plot_reg_tiled( def plot_reg_multiple( - xvals, yvals, polys, x_label="x", y_label="y", data_labels=None, reg_labels=None, fname=None + xvals: np.ndarray[tuple[int, int], np.dtype[np.float64]], + yvals: np.ndarray[tuple[int, int], np.dtype[np.float64]], + polys: list[np.ndarray[tuple[int], np.dtype[np.float64]]] | None = None, + x_label: str = "x", + y_label: str = "y", + data_labels: list[str] | None = None, + reg_labels: list[str] | None = None, + fname: str | Path | None = None, ) -> None: - """TODO.""" + """Plots trend lines for multiple line-fitting approaches in a the same plot. + + This function will show a plot through ``plt.show()`` and close it after the + window has been closed by the user unless``fname` is provided, in which case + the plot will be saved to disc under the given file name instead. 
+ + Args: + xvals: values on the x-axis in shape (#plots, #datapoints) + yvals: values on the y-axis in shape (#plots, #datapoints) + polys: polynomial parameters as accepted by ``np.polyval`` in shape + (#plots, #params) + x_label: x axis label + y_label: y axis label + data_labels: labels of the povided datasets + reg_labels: labels of the regression lines + fname: file name (if not None, the plot will be saved to disc instead of + showing it though ``plt.show()``) + """ import matplotlib.pyplot as plt if data_labels is None: @@ -1423,166 +1460,204 @@ def plot_reg_multiple( plt.savefig(fname) plt.close() +@overload +def hurst_rs( + data: np.typing.FloatArrayLike | np.typing.IntArrayLike, + nvals: np.typing.IntArrayLike | None = None, + fit: FittingMethod = "RANSAC", + *, + debug_plot: bool = False, + debug_data: Literal[False] = False, + plot_file: str | Path | None = None, + corrected: bool = True, + unbiased: bool = True, +) -> float: ... + +@overload def hurst_rs( - data, - nvals=None, - fit="RANSAC", - debug_plot=False, - debug_data=False, - plot_file=None, - corrected=True, - unbiased=True, + data: np.typing.FloatArrayLike | np.typing.IntArrayLike, + nvals: np.typing.IntArrayLike | None = None, + fit: FittingMethod = "RANSAC", + *, + debug_plot: bool = False, + debug_data: Literal[True] = True, + plot_file: str | Path | None = None, + corrected: bool = True, + unbiased: bool = True, +) -> tuple[ + float, + tuple[ + np.ndarray[tuple[int], np.dtype[np.float64]], + np.ndarray[tuple[int], np.dtype[np.float64]], + np.ndarray[tuple[int], np.dtype[np.float64]], + ], +]: ... 
+ + +def hurst_rs( + data: np.typing.FloatArrayLike | np.typing.IntArrayLike, + nvals: np.typing.IntArrayLike | None = None, + fit: FittingMethod = "RANSAC", + *, + debug_plot: bool = False, + debug_data: bool = False, + plot_file: str | Path | None = None, + corrected: bool = True, + unbiased: bool = True, +) -> ( + float + | tuple[ + float, + tuple[ + np.ndarray[tuple[int], np.dtype[np.float64]], + np.ndarray[tuple[int], np.dtype[np.float64]], + np.ndarray[tuple[int], np.dtype[np.float64]], + ], + ] ): """Calculates the Hurst exponent by a standard rescaled range (R/S) approach. Explanation of Hurst exponent: - The Hurst exponent is a measure for the "long-term memory" of a - time series, meaning the long statistical dependencies in the data that do - not originate from cycles. - - It originates from H.E. Hursts observations of the problem of long-term - storage in water reservoirs. If x_i is the discharge of a river in year i - and we observe this discharge for N years, we can calculate the storage - capacity that would be required to keep the discharge steady at its mean - value. - - To do so, we first subtract the mean over all x_i from the individual - x_i to obtain the departures x'_i from the mean for each year i. As the - excess or deficit in discharge always carries over from year i to year i+1, - we need to examine the cumulative sum of x'_i, denoted by y_i. This - cumulative sum represents the filling of our hypothetical storage. If the - sum is above 0, we are storing excess discharge from the river, if it is - below zero we have compensated a deficit in discharge by releasing - water from the storage. The range (maximum - minimum) R of y_i therefore - represents the total capacity required for the storage. - - Hurst showed that this value follows a steady trend for varying N if it - is normalized by the standard deviation sigma over the x_i. 
Namely he - obtained the following formula: - - R/sigma = (N/2)^K - - In this equation, K is called the Hurst exponent. Its value is 0.5 for - white noise, but becomes greater for time series that exhibit some positive - dependency on previous values. For negative dependencies it becomes less - than 0.5. + The Hurst exponent is a measure for the "long-term memory" of a + time series, meaning the long statistical dependencies in the data that do + not originate from cycles. + + It originates from H.E. Hursts observations of the problem of long-term + storage in water reservoirs. If x_i is the discharge of a river in year i + and we observe this discharge for N years, we can calculate the storage + capacity that would be required to keep the discharge steady at its mean + value. + + To do so, we first subtract the mean over all x_i from the individual + x_i to obtain the departures x'_i from the mean for each year i. As the + excess or deficit in discharge always carries over from year i to year i+1, + we need to examine the cumulative sum of x'_i, denoted by y_i. This + cumulative sum represents the filling of our hypothetical storage. If the + sum is above 0, we are storing excess discharge from the river, if it is + below zero we have compensated a deficit in discharge by releasing + water from the storage. The range (maximum - minimum) R of y_i therefore + represents the total capacity required for the storage. + + Hurst showed that this value follows a steady trend for varying N if it + is normalized by the standard deviation sigma over the x_i. Namely he + obtained the following formula: + + R/sigma = (N/2)^K + + In this equation, K is called the Hurst exponent. Its value is 0.5 for + white noise, but becomes greater for time series that exhibit some positive + dependency on previous values. For negative dependencies it becomes less + than 0.5. Explanation of the algorithm: - The rescaled range (R/S) approach is directly derived from Hurst's - definition. 
The time series of length N is split into non-overlapping - subseries of length n. Then, R and S (S = sigma) are calculated for each - subseries and the mean is taken over all subseries yielding (R/S)_n. This - process is repeated for several lengths n. Finally, the exponent K is - obtained by fitting a straight line to the plot of log((R/S)_n) vs log(n). + The rescaled range (R/S) approach is directly derived from Hurst's + definition. The time series of length N is split into non-overlapping + subseries of length n. Then, R and S (S = sigma) are calculated for each + subseries and the mean is taken over all subseries yielding (R/S)_n. This + process is repeated for several lengths n. Finally, the exponent K is + obtained by fitting a straight line to the plot of log((R/S)_n) vs log(n). - There seems to be no consensus how to chose the subseries lenghts n. - This function therefore leaves the choice to the user. The module provides - some utility functions for "typical" values: + There seems to be no consensus how to chose the subseries lenghts n. + This function therefore leaves the choice to the user. The module provides + some utility functions for "typical" values: * binary_n: N/2, N/4, N/8, ... * logarithmic_n: min_n, min_n * f, min_n * f^2, ... References: - .. [h_1] H. E. Hurst, “The problem of long-term storage in reservoirs,” - International Association of Scientific Hydrology. Bulletin, vol. 1, - no. 3, pp. 13–27, 1956. - .. [h_2] H. E. Hurst, “A suggested statistical model of some time series - which occur in nature,” Nature, vol. 180, p. 494, 1957. - .. [h_3] R. Weron, “Estimating long-range dependence: finite sample - properties and confidence intervals,” Physica A: Statistical Mechanics - and its Applications, vol. 312, no. 1, pp. 285–299, 2002. + .. [h_1] H. E. Hurst, “The problem of long-term storage in reservoirs,” + International Association of Scientific Hydrology. Bulletin, vol. 1, + no. 3, pp. 13–27, 1956. + .. [h_2] H. E. 
Hurst, “A suggested statistical model of some time series + which occur in nature,” Nature, vol. 180, p. 494, 1957. + .. [h_3] R. Weron, “Estimating long-range dependence: finite sample + properties and confidence intervals,” Physica A: Statistical Mechanics + and its Applications, vol. 312, no. 1, pp. 285–299, 2002. Reference Code: - .. [h_a] "hurst" function in R-package "pracma", - url: https://cran.r-project.org/web/packages/pracma/pracma.pdf - - Note: Pracma yields several estimates of the Hurst exponent, which - are listed below. Unless otherwise stated they use the divisors - of the length of the sequence as n. The length is reduced by at - most 1% to find the value that has the most divisors. - - * The "Simple R/S" estimate is just log((R/S)_n) / log(n) for - n = N. - * The "theoretical Hurst exponent" is the value that would be - expected of an uncorrected rescaled range approach for random - noise of the size of the input data. - * The "empirical Hurst exponent" is the uncorrected Hurst exponent - obtained by the rescaled range approach. - * The "corrected empirical Hurst exponent" is the - Anis-Lloyd-Peters corrected Hurst exponent, but with - sqrt(1/2 * pi * n) added to the (R/S)_n before the log. - * The "corrected R over S Hurst exponent" uses the R-function "lm" - instead of pracmas own "polyfit" and uses n = N/2, N/4, N/8, ... - by successively halving the subsequences (which means that some - subsequences may be one element longer than others). In contrast - to its name it does not use the Anis-Lloyd-Peters correction - factor. - - If you want to compare the output of pracma to the output of - nolds, the "empirical hurst exponent" is the only measure that - exactly corresponds to the Hurst measure implemented in nolds - (by choosing corrected=False, fit="poly" and employing the same - strategy for choosing n as the divisors of the (reduced) - sequence length). - .. 
[h_b] Rafael Weron, "HURST: MATLAB function to compute the Hurst - exponent using R/S Analysis", - url: https://ideas.repec.org/c/wuu/hscode/m11003.html - - Note: When the same values for nvals are used and fit is set to - "poly", nolds yields exactly the same results as this - implementation. - .. [h_c] Bill Davidson, "Hurst exponent", - url: http://www.mathworks.com/matlabcentral/fileexchange/9842-hurst-exponent + .. [h_a] "hurst" function in R-package "pracma", + url: https://cran.r-project.org/web/packages/pracma/pracma.pdf + + Note: Pracma yields several estimates of the Hurst exponent, which + are listed below. Unless otherwise stated they use the divisors + of the length of the sequence as n. The length is reduced by at + most 1% to find the value that has the most divisors. + + * The "Simple R/S" estimate is just log((R/S)_n) / log(n) for + n = N. + * The "theoretical Hurst exponent" is the value that would be + expected of an uncorrected rescaled range approach for random + noise of the size of the input data. + * The "empirical Hurst exponent" is the uncorrected Hurst exponent + obtained by the rescaled range approach. + * The "corrected empirical Hurst exponent" is the + Anis-Lloyd-Peters corrected Hurst exponent, but with + sqrt(1/2 * pi * n) added to the (R/S)_n before the log. + * The "corrected R over S Hurst exponent" uses the R-function "lm" + instead of pracmas own "polyfit" and uses n = N/2, N/4, N/8, ... + by successively halving the subsequences (which means that some + subsequences may be one element longer than others). In contrast + to its name it does not use the Anis-Lloyd-Peters correction + factor. + + If you want to compare the output of pracma to the output of + nolds, the "empirical hurst exponent" is the only measure that + exactly corresponds to the Hurst measure implemented in nolds + (by choosing corrected=False, fit="poly" and employing the same + strategy for choosing n as the divisors of the (reduced) + sequence length). + .. 
[h_b] Rafael Weron, "HURST: MATLAB function to compute the Hurst + exponent using R/S Analysis", + url: https://ideas.repec.org/c/wuu/hscode/m11003.html + + Note: When the same values for nvals are used and fit is set to + "poly", nolds yields exactly the same results as this + implementation. + .. [h_c] Bill Davidson, "Hurst exponent", + url: http://www.mathworks.com/matlabcentral/fileexchange/9842-hurst-exponent Args: - data (array-like of float): - time series - Kwargs: - nvals (iterable of int): - sizes of subseries to use - (default: logmid_n(total_N, ratio=1/4.0, nsteps=15) , that is 15 - logarithmically spaced values in the medium 25% of the logarithmic range) - - Generally, the choice for n is a trade-off between the length and the - number of the subsequences that are used for the calculation of the - (R/S)_n. Very low values of n lead to high variance in the ``r`` and - ``s`` while very high values may leave too few subsequences that the mean - along them is still meaningful. Logarithmic spacing makes sense, because - it translates to even spacing in the log-log-plot. - fit (str): - the fitting method to use for the line fit, either 'poly' for normal - least squares polynomial fitting or 'RANSAC' for RANSAC-fitting which - is more robust to outliers - debug_plot (boolean): - if True, a simple plot of the final line-fitting step will be shown - debug_data (boolean): - if True, debugging data will be returned alongside the result - plot_file (str): - if debug_plot is True and plot_file is not None, the plot will be saved - under the given file name instead of directly showing it through - ``plt.show()`` - corrected (boolean): - if True, the Anis-Lloyd-Peters correction factor will be applied to the - output according to the expected value for the individual (R/S)_n - (see [h_3]_) - unbiased (boolean): - if True, the standard deviation based on the unbiased variance - (1/(N-1) instead of 1/N) will be used. 
This should be the default choice, - since the true mean of the sequences is not known. This parameter should - only be changed to recreate results of other implementations. + data: time series + nvals: sizes of subseries to use + (default: logmid_n(total_N, ratio=1/4.0, nsteps=15) , that is 15 + logarithmically spaced values in the medium 25% of the logarithmic range) + + Generally, the choice for n is a trade-off between the length and the + number of the subsequences that are used for the calculation of the + (R/S)_n. Very low values of n lead to high variance in the ``r`` and + ``s`` while very high values may leave too few subsequences that the mean + along them is still meaningful. Logarithmic spacing makes sense, because + it translates to even spacing in the log-log-plot. + fit: the fitting method to use for the line fit, either 'poly' for normal + least squares polynomial fitting or 'RANSAC' for RANSAC-fitting which + is more robust to outliers + debug_plot: if True, a simple plot of the final line-fitting step will be shown + debug_data: if True, debugging data will be returned alongside the result + plot_file: if debug_plot is True and plot_file is not None, the plot will be saved + under the given file name instead of directly showing it through + ``plt.show()`` + corrected: if True, the Anis-Lloyd-Peters correction factor will be applied to the + output according to the expected value for the individual (R/S)_n + (see [h_3]_) + unbiased: if True, the standard deviation based on the unbiased variance + (1/(N-1) instead of 1/N) will be used. This should be the default choice, + since the true mean of the sequences is not known. This parameter should + only be changed to recreate results of other implementations. 
Returns: - float: - estimated Hurst exponent K using a rescaled range approach (if K = 0.5 + Estimated Hurst exponent K using a rescaled range approach (if K = 0.5 there are no long-range correlations in the data, if K < 0.5 there are negative long-range correlations, if K > 0.5 there are positive - long-range correlations) - (1d-vector, 1d-vector, list): - only present if debug_data is True: debug data of the form - ``(nvals, rsvals, poly)`` where ``nvals`` are the values used for log(n), - ``rsvals`` are the corresponding log((R/S)_n) and ``poly`` are the line - coefficients (``[slope, intercept]``) + long-range correlations). + + If ``debug_data`` is True, the return value is instead a tuple containing + + * the Hurst exponent K + * a tuple of three arrays: + - nvals: the values used for log(n) + - rsvals: the corresponding log((R/S)_n) values + - poly: the coefficients of the line fit (``[slope, intercept]``) """ data = np.asarray(data) total_N = len(data) @@ -1591,6 +1666,8 @@ def hurst_rs( # spaced datapoints leaning towards the middle of the logarithmic range # (since both too small and too large n introduce too much variance) nvals = logmid_n(total_N, ratio=1 / 4.0, nsteps=15) + else: + nvals = np.array(nvals, dtype=np.int32) # get individual values for (R/S)_n rsvals = np.array([rs(data, n, unbiased=unbiased) for n in nvals]) # filter NaNs (zeros should not be possible, because if R is 0 then @@ -1600,7 +1677,7 @@ def hurst_rs( nvals = np.asarray(nvals)[not_nan] # it may happen that no rsvals are left (if all values of data are the same) if len(rsvals) == 0: - poly = [np.nan, np.nan] + poly = np.array([np.nan, np.nan], dtype=np.float64) if debug_plot: warnings.warn( "Cannot display debug plot, all (R/S)_n are NaN", @@ -1623,125 +1700,160 @@ def hurst_rs( return (h, (np.log(nvals), np.log(rsvals), poly)) return h +@overload +def mfhurst_b( + data: np.typing.IntArrayLike | np.typing.FloatArrayLike, + qvals: np.typing.FloatArrayLike | None = None, + dists:
np.typing.FloatArrayLike | None = None, + fit: FittingMethod = "poly", + *, + debug_plot: bool = False, + debug_data: Literal[False] = False, + plot_file: str | Path | None = None, +) -> np.ndarray[tuple[int], np.dtype[np.float64]]: ... -# TODO implement MFDFA as second (more reliable) measure for multifractality -# NOTE: probably not needed, since mfhurst_b is already pretty reliable - +@overload +def mfhurst_b( + data: np.typing.IntArrayLike | np.typing.FloatArrayLike, + qvals: np.typing.FloatArrayLike | None = None, + dists: np.typing.FloatArrayLike | None = None, + fit: FittingMethod = "poly", + *, + debug_plot: bool = False, + debug_data: Literal[True] = True, + plot_file: str | Path | None = None, +) -> tuple[ + np.ndarray[tuple[int], np.dtype[np.float64]], + tuple[ + np.ndarray[tuple[int], np.dtype[np.float64]], + np.ndarray[tuple[int], np.dtype[np.float64]], + np.ndarray[tuple[int], np.dtype[np.float64]], + ], +]: ... def mfhurst_b( - data, qvals=None, dists=None, fit="poly", debug_plot=False, debug_data=False, plot_file=None + data: np.typing.IntArrayLike | np.typing.FloatArrayLike, + qvals: np.typing.FloatArrayLike | None = None, + dists: np.typing.FloatArrayLike | None = None, + fit: FittingMethod = "poly", + *, + debug_plot: bool = False, + debug_data: bool = False, + plot_file: str | Path | None = None, +) -> ( + np.ndarray[tuple[int], np.dtype[np.float64]] + | tuple[ + np.ndarray[tuple[int], np.dtype[np.float64]], + tuple[ + np.ndarray[tuple[int], np.dtype[np.float64]], + np.ndarray[tuple[int], np.dtype[np.float64]], + np.ndarray[tuple[int], np.dtype[np.float64]], + ], + ] ): - r"""Calculates the Generalized Hurst Exponent H_q for different q according to - A.-L. Barabási and T. Vicsek. + r"""Calculates the Generalized Hurst Exponent H_q according to A.-L. Barabási and T. Vicsek. 
Explanation of the Generalized Hurst Exponent: - The Generalized Hurst Exponent (GHE, H_q or H(q)) can (as the name implies) - be seen as a generalization of the Hurst exponent for data series with - multifractal properties. It's origins are however not directly related - to Hurst's rescaled range approach, but to the definition of self-affine - functions. - - A single-valued self-affine function h by definition satisfies the relation - - h(x) ~= lambda^(-H) h(lambda x) - - for any positive real valued lambda and some positive real valued exponent - H, which is called the Hurst, Hölder, Hurst-Hölder or roughness exponent - in the literature. In other words you can view lambda as a scaling factor - or "step size". With lambda < 1 we decrease the step size and zoom into our - function. In this case lambda^(-H) becomes greater than one, meaning that - h(lambda x) looks similar to a smaller version of h(x). With lambda > 1 we - zoom out and get lambda^(-H) < 1. - - To calculate H, you can use the height-height correlation function (also - called autocorrelation) c(d) = <(h(x) - h(x + d))^2>_x where <...>_x - denotes the expected value over x. Here, the aforementioned self-affine - property is equivalent to c(d) ~ d^(2H). You can also think of d as a step - size. Increasing or decreasing d from 1 to some y is the same as setting - lambda = y: It increases or decreases the scale of the function by a factor - of 1/y^(-H) = y^H. Therefore the squared differences will be proportional - to y^2H. - - A.-L. Barabási and T. 
Vicsek extended this notion to an infinite hierarchy - of exponents H_q for the qth-order correlation function with - - c_q(d) = <(h(x) - h(x + d))^q>_x ~ d^(q H_q) - - With q = 1 you get a value H_1 that is closely related to the normal Hurst - exponent, but with different q you either get a constant value H_q = H_0 - independent of q, which indicates that the function has no multifractal - properties, or different H_q, which is a sign for multifractal behavior. - - T. Di Matteo, T. Aste and M. M. Dacorogna applied this technique to - financial data series and gave it the name "Generalized Hurst Exponent". + The Generalized Hurst Exponent (GHE, H_q or H(q)) can (as the name implies) + be seen as a generalization of the Hurst exponent for data series with + multifractal properties. It's origins are however not directly related + to Hurst's rescaled range approach, but to the definition of self-affine + functions. + + A single-valued self-affine function h by definition satisfies the relation + + h(x) ~= lambda^(-H) h(lambda x) + + for any positive real valued lambda and some positive real valued exponent + H, which is called the Hurst, Hölder, Hurst-Hölder or roughness exponent + in the literature. In other words you can view lambda as a scaling factor + or "step size". With lambda < 1 we decrease the step size and zoom into our + function. In this case lambda^(-H) becomes greater than one, meaning that + h(lambda x) looks similar to a smaller version of h(x). With lambda > 1 we + zoom out and get lambda^(-H) < 1. + + To calculate H, you can use the height-height correlation function (also + called autocorrelation) c(d) = <(h(x) - h(x + d))^2>_x where <...>_x + denotes the expected value over x. Here, the aforementioned self-affine + property is equivalent to c(d) ~ d^(2H). You can also think of d as a step + size. 
Increasing or decreasing d from 1 to some y is the same as setting + lambda = y: It increases or decreases the scale of the function by a factor + of 1/y^(-H) = y^H. Therefore the squared differences will be proportional + to y^2H. + + A.-L. Barabási and T. Vicsek extended this notion to an infinite hierarchy + of exponents H_q for the qth-order correlation function with + + c_q(d) = <(h(x) - h(x + d))^q>_x ~ d^(q H_q) + + With q = 1 you get a value H_1 that is closely related to the normal Hurst + exponent, but with different q you either get a constant value H_q = H_0 + independent of q, which indicates that the function has no multifractal + properties, or different H_q, which is a sign for multifractal behavior. + + T. Di Matteo, T. Aste and M. M. Dacorogna applied this technique to + financial data series and gave it the name "Generalized Hurst Exponent". Explanation of the Algorithm: - Curiously, I could not find any algorithmic description how to calculate - H_q in the literature. Researchers seem to just imply that you can obtain - the exponent by a line fitting algorithm in a log-log plot, but they do not - talk about the actual procedure or the required parameters. + Curiously, I could not find any algorithmic description how to calculate + H_q in the literature. Researchers seem to just imply that you can obtain + the exponent by a line fitting algorithm in a log-log plot, but they do not + talk about the actual procedure or the required parameters. - Essentially, we can calculate c_q(d) of a discrete evenly sampled time - series Y = [y_0, y_1, y_2, ... y_(N-1)] by taking the absolute differences - [\|y_0 - y_d\|, \|y_1 - y_(d+1)\|, ... , \|y_(N-d-1) - y_(N-1)\|] raising them to - the qth power and taking the mean. + Essentially, we can calculate c_q(d) of a discrete evenly sampled time + series Y = [y_0, y_1, y_2, ... y_(N-1)] by taking the absolute differences + [\|y_0 - y_d\|, \|y_1 - y_(d+1)\|, ... 
, \|y_(N-d-1) - y_(N-1)\|] raising them to + the qth power and taking the mean. - Now we take the logarithm on both sides of our relation c_q(d) ~ d^(q H_q) - and get + Now we take the logarithm on both sides of our relation c_q(d) ~ d^(q H_q) + and get - log(c_q(d)) ~ log(d) * q H_q + log(c_q(d)) ~ log(d) * q H_q - So in other words if we plot log(c_q(d)) against log(d) for several d we - should get a straight line with slope q H_q. This enables us to use a - linear least squares algorithm to obtain H_q. + So in other words if we plot log(c_q(d)) against log(d) for several d we + should get a straight line with slope q H_q. This enables us to use a + linear least squares algorithm to obtain H_q. - Note that we consider x as a discrete variable in the range 0 <= x < N. - We can do this, because the actual sampling rate of our data series does - not alter the result. After taking the logarithm any scaling factor delta_x - would only result in an additive term since - log(delta_x * x) = log(x) + log(delta_x) and we only care about the slope - of the line and not the intercept. + Note that we consider x as a discrete variable in the range 0 <= x < N. + We can do this, because the actual sampling rate of our data series does + not alter the result. After taking the logarithm any scaling factor delta_x + would only result in an additive term since + log(delta_x * x) = log(x) + log(delta_x) and we only care about the slope + of the line and not the intercept. References: - .. [mh_1] A.-L. Barabási and T. Vicsek, “Multifractality of self-affine - fractals,” Physical Review A, vol. 44, no. 4, pp. 2730–2733, 1991. + .. [mh_1] A.-L. Barabási and T. Vicsek, “Multifractality of self-affine + fractals,” Physical Review A, vol. 44, no. 4, pp. 2730–2733, 1991. 
Args: - data (array-like of float): - time series of data points (should be evenly sampled) - - Kwargs: - qvals (iterable of float or int): - values of q for which H_q should be calculated (default: [1]) - dists (iterable of int): - distances for which the height-height correlation should be calculated - (determines the x-coordinates in the log-log plot) - default: logarithmic_n(1, max(20, 0.02 * len(data)), 1.5) to ensure - even spacing on the logarithmic axis - fit (str): - the fitting method to use for the line fit, either 'poly' for normal - least squares polynomial fitting or 'RANSAC' for RANSAC-fitting which - is more robust to outliers - debug_plot (boolean): - if True, a simple plot of the final line-fitting step will be shown - debug_data (boolean): - if True, debugging data will be returned alongside the result - plot_file (str): - if debug_plot is True and plot_file is not None, the plot will be saved - under the given file name instead of directly showing it through - ``plt.show()`` + data: time series of data points (should be evenly sampled) + qvals: values of q for which H_q should be calculated (default: [1]) + dists: distances for which the height-height correlation should be calculated + (determines the x-coordinates in the log-log plot) + default: logarithmic_n(1, max(20, 0.02 * len(data)), 1.5) to ensure + even spacing on the logarithmic axis + fit: the fitting method to use for the line fit, either 'poly' for normal + least squares polynomial fitting or 'RANSAC' for RANSAC-fitting which + is more robust to outliers + debug_plot: if True, a simple plot of the final line-fitting step will be shown + debug_data: if True, debugging data will be returned alongside the result + plot_file: if debug_plot is True and plot_file is not None, the plot will be saved + under the given file name instead of directly showing it through + ``plt.show()`` Returns: - array of float: - list of H_q for every q given in ``qvals`` - (1d-vector, 2d-vector, 2d-vector): - 
only present if debug_data is True: debug data of the form - ``(xvals, yvals, poly)`` where ``xvals`` is the logarithm of ``dists``, - ``yvals`` are the logarithms of the corresponding height-height- - correlations for each distance (first dimension) and each q - (second dimension) in the shape len(dists) x len(qvals) and ``poly`` are - the line coefficients (``[slope, intercept]``) for each q in the shape - len(qvals) x 2. + list of H_q for every q given in ``qvals``. If ``debug_data`` is True, + the return value is instead a tuple containing + + * the H_q values for each q in ``qvals`` + * a tuple of three arrays + - xvals: the logarithm of the distances used for the height-height + correlation + - yvals: the logarithm of the height-height correlations for each + distance (first dimension) and each q (second dimension) with shape + (len(dists), len(qvals)) + - poly: the coefficients of the line fit (``[slope, intercept]``) + for each q in the shape (len(qvals), 2). """ # transform to array if necessary data = np.asarray(data, dtype=np.float64) @@ -1749,16 +1861,19 @@ def mfhurst_b( # actual default parameter would introduce shared list # see: http://pylint-messages.wikidot.com/messages:w0102 qvals = [1] + qvals = np.asarray(qvals, dtype=np.float64) if dists is None: - dists = logarithmic_n(1, max(20, 0.02 * len(data)), 1.5) - dists = np.asarray(dists) - if len(data) < 60: + dists = logarithmic_n(1, np.ceil(max(20, 0.02 * len(data))), 1.5) + dists = np.asarray(dists, dtype=np.float64) + min_reliable_n = 60 + if len(data) < min_reliable_n: warnings.warn( f"H(q) is not reliable for small time series ({len(data)} < 60)", stacklevel=2, ) - def hhcorr(d, q): + def hhcorr(d: int, q: int) -> float: + """Calculates the height-height correlation for a given distance d and q.""" diffs = np.abs(data[:-d] - data[d:]) diffs = diffs[np.where(diffs > 0)] return np.mean(diffs**q) @@ -1771,16 +1886,18 @@ def hhcorr(d, q): # line fitting xvals = np.log(dists) yvals = 
np.log(corrvals) - polys = [poly_fit(xvals, yvals[:, qi], 1, fit=fit) for qi in range(len(qvals))] + polys = np.array( + [poly_fit(xvals, yvals[:, qi], 1, fit=fit) for qi in range(len(qvals))], dtype=np.float64 + ) H = np.array(polys)[:, 0] / qvals if debug_plot: plot_reg_multiple( - [xvals] * len(qvals), - [yvals[:, qi] / qvals[qi] for qi in range(len(qvals))], + np.array([xvals] * len(qvals), dtype=np.float64), + np.array([yvals[:, qi] / qvals[qi] for qi in range(len(qvals))], dtype=np.float64), [p / q for p, q in zip(polys, qvals)], x_label="log(x)", y_label="$\\log(c_q(x)) / q$", - data_labels=["q = %d" % q for q in qvals], + data_labels=[f"q = {q}" for q in qvals], reg_labels=[f"reg. line (H = {h:.3f})" for h in H], fname=plot_file, ) @@ -1789,7 +1906,7 @@ def hhcorr(d, q): return H -def _genhurst(S, q): +def _genhurst(S: np.ndarray[tuple[int], np.dtype[np.float64]], q: float) -> float: """Computes the generalized hurst exponent H_q for time series S. This function should not be used. It is only kept here to demonstrate that @@ -1834,7 +1951,7 @@ def _genhurst(S, q): ## formatting and datatype fixes : Christopher Schölzel, 17/02/2019 ## """ L = len(S) - if L < 100: + if L < 100: # noqa: PLR2004 warnings.warn("Data series very short!", stacklevel=2) H = np.zeros((len(range(5, 20)), 1)) k = 0 @@ -1872,12 +1989,16 @@ def _genhurst(S, q): ) H[k] = SSxy / SSxx k = k + 1 - return np.mean(H) / q + return float(np.mean(H) / q) + +def _aste_line_fit( + x: np.typing.IntArrayLike | np.typing.FloatArrayLike, + y: np.typing.IntArrayLike | np.typing.FloatArrayLike, +) -> list[float]: + """Simple linear regression with ordinary least squares. -def _aste_line_fit(x, y): - """Simple linear regression with ordinary least squares - https://en.wikipedia.org/wiki/Simple_linear_regression. + See https://en.wikipedia.org/wiki/Simple_linear_regression. NOTE: this function is left here to demonstrate the correctness of T. Aste's MATLAB code for hurst_multifractal_dm. 
You can get the same @@ -1896,7 +2017,7 @@ def _aste_line_fit(x, y): # = sum(x ^ 2) - N * mx ^ 2 var = np.sum(x**2) - N * mx * mx # corvariance of x and y - # sum((x - mx) * (y - my)) + # sum((x - mx) * (y - my)) # noqa: ERA001 # = sum(xy) - sum(mx * y) - sum(my * x) + N * mx * my # = sum(xy) - mx * sum(y) - my * sum(x) + N * mx * my # = sum(xy) - mx * my * N - my * mx * N + N * mx * my @@ -1919,22 +2040,51 @@ def _aste_line_fit(x, y): return [intercept, slope] +@overload def mfhurst_dm( - data, - qvals=None, - max_dists=range(5, 20), - detrend=True, - fit="poly", - debug_plot=False, - debug_data=False, - plot_file=None, + data: np.typing.IntArrayLike | np.typing.FloatArrayLike, + qvals: np.typing.FloatArrayLike | None = None, + max_dists: np.typing.IntArrayLike | None = None, + *, + detrend: bool = True, + fit: FittingMethod = "poly", + debug_plot: bool = False, + debug_data: Literal[False] = False, + plot_file: str | Path | None = None, +): ... + + +@overload +def mfhurst_dm( + data: np.typing.IntArrayLike | np.typing.FloatArrayLike, + qvals: np.typing.FloatArrayLike | None = None, + max_dists: np.typing.IntArrayLike | None = None, + *, + detrend: bool = True, + fit: FittingMethod = "poly", + debug_plot: bool = False, + debug_data: Literal[True] = True, + plot_file: str | Path | None = None, +): ... + + +def mfhurst_dm( + data: np.typing.IntArrayLike | np.typing.FloatArrayLike, + qvals: np.typing.FloatArrayLike | None = None, + max_dists: np.typing.IntArrayLike | None = None, + *, + detrend: bool = True, + fit: FittingMethod = "poly", + debug_plot: bool = False, + debug_data: bool = False, + plot_file: str | Path | None = None, ): - """Calculates the Generalized Hurst Exponent H_q for different q according to - the MATLAB code of Tomaso Aste - one of the authors that introduced this - measure. + """Calculates the Generalized Hurst Exponent H_q according to Di Matteo and Aste. + + This implementation is a port of the MATLAB code of Tomaso Aste.
Explanation of the General Hurst Exponent: - See mfhurst_b. + See mfhurst_b. Warning: I do not recommend to use this function unless you want to reproduce examples from Di Matteo et al.. From my experiments and a critical code @@ -1967,55 +2117,46 @@ def mfhurst_dm( it will only introduce a bias towards low values for d. References: - .. [mhd_1] T. Di Matteo, T. Aste, and M. M. Dacorogna, “Scaling behaviors - in differently developed markets,” Physica A: Statistical Mechanics - and its Applications, vol. 324, no. 1–2, pp. 183–188, 2003. + .. [mhd_1] T. Di Matteo, T. Aste, and M. M. Dacorogna, “Scaling behaviors + in differently developed markets,” Physica A: Statistical Mechanics + and its Applications, vol. 324, no. 1–2, pp. 183–188, 2003. Reference code: - .. [mhd_a] Tomaso Aste, "Generalized Hurst exponent", - url: http://de.mathworks.com/matlabcentral/fileexchange/30076-generalized-hurst-exponent + .. [mhd_a] Tomaso Aste, "Generalized Hurst exponent", + url: http://de.mathworks.com/matlabcentral/fileexchange/30076-generalized-hurst-exponent Args: - data (1d-vector of float): - input data (should be evenly sampled) - qvals (1d-vector of float) - values of q for which H_q should be calculated (default: [1]) - - Kwargs: - max_dists (1d-vector of int): - different values to test for tau_max, the maximum value for the distance - d. The resulting H_q will be a mean of all H_q calculated with tau_max - = max_dists[0], max_dists[1], ... . 
- detrend (boolean): - if True, a linear trend will be removed from the data before H_q will - be calculated - fit (str): - the fitting method to use for the line fit, either 'poly' for normal - least squares polynomial fitting or 'RANSAC' for RANSAC-fitting which - is more robust to outliers - debug_plot (boolean): - if True, a simple plot of the final line-fitting step will be shown - debug_data (boolean): - if True, debugging data will be returned alongside the result - plot_file (str): - if debug_plot is True and plot_file is not None, the plot will be saved - under the given file name instead of directly showing it through - ``plt.show()`` + data: input data (should be evenly sampled) + qvals: values of q for which H_q should be calculated (default: [1]) + max_dists: different values to test for tau_max, the maximum value for the distance + d. The resulting H_q will be a mean of all H_q calculated with tau_max + = max_dists[0], max_dists[1], ... . + detrend: if True, a linear trend will be removed from the data before H_q will + be calculated + fit: the fitting method to use for the line fit, either 'poly' for normal + least squares polynomial fitting or 'RANSAC' for RANSAC-fitting which + is more robust to outliers + debug_plot: if True, a simple plot of the final line-fitting step will be shown + debug_data: if True, debugging data will be returned alongside the result + plot_file: if debug_plot is True and plot_file is not None, the plot will be saved + under the given file name instead of directly showing it through + ``plt.show()`` Returns: - array of float: array of mH_q for every q given in ``qvals`` where mH_q is the mean of all H_q calculated for different max distances in max_dists. 
- array of float: - array of standard deviations sH_q for each mH_q returned - (1d-vector, 2d-vector, 2d-vector): - only present if debug_data is True: debug data of the form - ``(xvals, yvals, poly)`` where ``xvals`` is the logarithm of ``dists``, - ``yvals`` are the logarithms of the corresponding height-height- - correlations for each distance (first dimension) and each q - (second dimension) in the shape len(dists) x len(qvals) and ``poly`` are - the line coefficients (``[slope, intercept]``) for each q in the shape - len(qvals) x 2. + + If ``debug_data`` is True, the return value is instead a list containing + + * mH_q, sH_q: arrays of mean and standard deviation of H_q for each q in ``qvals`` + * debug_data: a tuple of three arrays + - xvals: the logarithm of the distances used for the height-height + correlation + - yvals: the logarithm of the height-height correlations for each + distance (first dimension) and each q (second dimension) with shape + (len(dists), len(qvals)) + - poly: the coefficients of the line fit (``[slope, intercept]``) + for each q in the shape (len(qvals), 2).
""" # transform to array if necessary data = np.asarray(data) @@ -2023,7 +2164,12 @@ def mfhurst_dm( # actual default parameter would introduce shared list # see: http://pylint-messages.wikidot.com/messages:w0102 qvals = [1] - if len(data) < 60: + qvals = np.asarray(qvals, dtype=np.float64) + if max_dists is None: + max_dists = range(5, 20) + max_dists = np.asarray(max_dists, dtype=np.int32) + min_reliable_n = 60 + if len(data) < min_reliable_n: warnings.warn( f"H(q) is not reliable for small time series ({len(data)} < 60)", stacklevel=2, @@ -2061,12 +2207,12 @@ def mfhurst_dm( if debug_plot: polys = [np.array(poly_fit(xvals, yvals[:, qi], 1)) / qvals[qi] for qi in range(len(qvals))] plot_reg_multiple( - [xvals] * len(qvals), - [yvals[:, qi] / qvals[qi] for qi in range(len(qvals))], + np.array([xvals] * len(qvals), dtype=np.float64), + np.array([yvals[:, qi] / qvals[qi] for qi in range(len(qvals))], dtype=np.float64), polys, x_label="log(x)", y_label="$\\log(c_q(x)) / q$", - data_labels=["q = %d" % q for q in qvals], + data_labels=[f"q = {q}" for q in qvals], reg_labels=[f"reg. line (H = {h:.3f})" for h in H[:, -1] / qvals], fname=plot_file, ) @@ -2076,103 +2222,160 @@ def mfhurst_dm( return [mH, sH, (xvals, yvals, polys)] return [mH, sH] +@overload +def corr_dim( + data: np.typing.IntArrayLike | np.typing.FloatArrayLike, + emb_dim: int = 2, + lag: int = 1, + rvals: np.typing.FloatArrayLike | None = None, + dist: Callable[ + [ + np.ndarray[tuple[int, int], np.dtype[np.float64]], + np.ndarray[tuple[int], np.dtype[np.float64]], + ], + np.ndarray[tuple[int], np.dtype[np.float64]], + ] = rowwise_euclidean, + fit: FittingMethod = "RANSAC", + *, + debug_plot: bool = False, + debug_data: Literal[False] = False, + plot_file: str | Path | None = None, +) -> float: ... 
+ +@overload def corr_dim( - data, - emb_dim, - lag=1, - rvals=None, - dist=rowwise_euclidean, - fit="RANSAC", - debug_plot=False, - debug_data=False, - plot_file=None, + data: np.typing.IntArrayLike | np.typing.FloatArrayLike, + emb_dim: int = 2, + lag: int = 1, + rvals: np.typing.FloatArrayLike | None = None, + dist: Callable[ + [ + np.ndarray[tuple[int, int], np.dtype[np.float64]], + np.ndarray[tuple[int], np.dtype[np.float64]], + ], + np.ndarray[tuple[int], np.dtype[np.float64]], + ] = rowwise_euclidean, + fit: FittingMethod = "RANSAC", + *, + debug_plot: bool = False, + debug_data: Literal[True] = True, + plot_file: str | Path | None = None, +) -> tuple[ + float, + tuple[ + np.ndarray[tuple[int], np.dtype[np.float64]], + np.ndarray[tuple[int], np.dtype[np.float64]], + np.ndarray[tuple[int], np.dtype[np.float64]], + ], +]: ... + +def corr_dim( + data: np.typing.IntArrayLike | np.typing.FloatArrayLike, + emb_dim: int = 2, + lag: int = 1, + rvals: np.typing.FloatArrayLike | None = None, + dist: Callable[ + [ + np.ndarray[tuple[int, int], np.dtype[np.float64]], + np.ndarray[tuple[int], np.dtype[np.float64]], + ], + np.ndarray[tuple[int], np.dtype[np.float64]], + ] = rowwise_euclidean, + fit: FittingMethod = "RANSAC", + *, + debug_plot: bool = False, + debug_data: bool = False, + plot_file: str | Path | None = None, +) -> ( + float + | tuple[ + float, + tuple[ + np.ndarray[tuple[int], np.dtype[np.float64]], + np.ndarray[tuple[int], np.dtype[np.float64]], + np.ndarray[tuple[int], np.dtype[np.float64]], + ], + ] ): """Calculates the correlation dimension with the Grassberger-Procaccia algorithm. Explanation of correlation dimension: - The correlation dimension is a characteristic measure that can be used - to describe the geometry of chaotic attractors. It is defined using the - correlation sum C(r) which is the fraction of pairs of points X_i in the - phase space whose distance is smaller than r. 
+ The correlation dimension is a characteristic measure that can be used + to describe the geometry of chaotic attractors. It is defined using the + correlation sum C(r) which is the fraction of pairs of points X_i in the + phase space whose distance is smaller than r. - If the relation between C(r) and r can be described by the power law + If the relation between C(r) and r can be described by the power law - C(r) ~ r^D + C(r) ~ r^D - then D is called the correlation dimension of the system. + then D is called the correlation dimension of the system. - In a d-dimensional system, the maximum value for D is d. This value is - obtained for systems that expand uniformly in each dimension with time. - The lowest possible value is 0 for a system with constant C(r) (i.e. a - system that visits just one point in the phase space). Generally if D is - lower than d and the system has an attractor, this attractor is called - "strange" and D is a measure of this "strangeness". + In a d-dimensional system, the maximum value for D is d. This value is + obtained for systems that expand uniformly in each dimension with time. + The lowest possible value is 0 for a system with constant C(r) (i.e. a + system that visits just one point in the phase space). Generally if D is + lower than d and the system has an attractor, this attractor is called + "strange" and D is a measure of this "strangeness". Explanation of the algorithm: - The Grassberger-Procaccia algorithm calculates C(r) for a range of - different r and then fits a straight line into the plot of log(C(r)) - versus log(r). - - This version of the algorithm is created for one-dimensional (scalar) time - series. Therefore, before calculating C(r), a delay embedding of the time - series is performed to yield emb_dim dimensional vectors - Y_i = [X_i, X_(i+1*lag), X_(i+2*lag), ... X_(i+(embd_dim-1)*lag)]. 
Choosing - a higher value for emb_dim allows to reconstruct higher dimensional dynamics - and avoids "systematic errors due to corrections to scaling". Choosing a - higher value for lag allows to avoid overestimating correlation because - X_i ~= X_i+1, but it should also not be set too high to not underestimate - correlation due to exponential divergence of trajectories in chaotic systems. + The Grassberger-Procaccia algorithm calculates C(r) for a range of + different r and then fits a straight line into the plot of log(C(r)) + versus log(r). + + This version of the algorithm is created for one-dimensional (scalar) time + series. Therefore, before calculating C(r), a delay embedding of the time + series is performed to yield emb_dim dimensional vectors + Y_i = [X_i, X_(i+1*lag), X_(i+2*lag), ... X_(i+(embd_dim-1)*lag)]. Choosing + a higher value for emb_dim allows to reconstruct higher dimensional dynamics + and avoids "systematic errors due to corrections to scaling". Choosing a + higher value for lag allows to avoid overestimating correlation because + X_i ~= X_i+1, but it should also not be set too high to not underestimate + correlation due to exponential divergence of trajectories in chaotic systems. References: - .. [cd_1] P. Grassberger and I. Procaccia, “Characterization of strange - attractors,” Physical review letters, vol. 50, no. 5, p. 346, - 1983. - .. [cd_2] P. Grassberger and I. Procaccia, “Measuring the strangeness of - strange attractors,” Physica D: Nonlinear Phenomena, vol. 9, - no. 1, pp. 189–208, 1983. - .. [cd_3] P. Grassberger, “Grassberger-Procaccia algorithm,” - Scholarpedia, vol. 2, no. 5, p. 3043. - urL: http://www.scholarpedia.org/article/Grassberger-Procaccia_algorithm + .. [cd_1] P. Grassberger and I. Procaccia, “Characterization of strange + attractors,” Physical review letters, vol. 50, no. 5, p. 346, + 1983. + .. [cd_2] P. Grassberger and I. 
Procaccia, “Measuring the strangeness of + strange attractors,” Physica D: Nonlinear Phenomena, vol. 9, + no. 1, pp. 189–208, 1983. + .. [cd_3] P. Grassberger, “Grassberger-Procaccia algorithm,” + Scholarpedia, vol. 2, no. 5, p. 3043. + url: http://www.scholarpedia.org/article/Grassberger-Procaccia_algorithm Reference Code: - .. [cd_a] "corrDim" function in R package "fractal", - url: https://cran.r-project.org/web/packages/fractal/fractal.pdf - .. [cd_b] Peng Yuehua, "Correlation dimension", - url: http://de.mathworks.com/matlabcentral/fileexchange/24089-correlation-dimension + .. [cd_a] "corrDim" function in R package "fractal", + url: https://cran.r-project.org/web/packages/fractal/fractal.pdf + .. [cd_b] Peng Yuehua, "Correlation dimension", + url: http://de.mathworks.com/matlabcentral/fileexchange/24089-correlation-dimension Args: - data (array-like of float): - time series of data points - emb_dim (int): - embedding dimension - Kwargs: - rvals (iterable of float): - list of values for to use for r - (default: logarithmic_r(0.1 * std, 0.5 * std, 1.03)) - dist (function (2d-array, 1d-array) -> 1d-array): - row-wise difference function - fit (str): - the fitting method to use for the line fit, either 'poly' for normal - least squares polynomial fitting or 'RANSAC' for RANSAC-fitting which - is more robust to outliers - debug_plot (boolean): - if True, a simple plot of the final line-fitting step will be shown - debug_data (boolean): - if True, debugging data will be returned alongside the result - plot_file (str): - if debug_plot is True and plot_file is not None, the plot will be saved - under the given file name instead of directly showing it through - ``plt.show()`` + data: time series of data points + emb_dim: embedding dimension + rvals: list of values for to use for r + (default: logarithmic_r(0.1 * std, 0.5 * std, 1.03)) + dist: row-wise difference function + fit: the fitting method to use for the line fit, either 'poly' for normal + least squares 
polynomial fitting or 'RANSAC' for RANSAC-fitting which + is more robust to outliers + debug_plot: if True, a simple plot of the final line-fitting step will be shown + debug_data: if True, debugging data will be returned alongside the result + plot_file: if debug_plot is True and plot_file is not None, the plot will be saved + under the given file name instead of directly showing it through + ``plt.show()`` Returns: - float: correlation dimension as slope of the line fitted to log(r) vs log(C(r)) - (1d-vector, 1d-vector, list): - only present if debug_data is True: debug data of the form - ``(rvals, csums, poly)`` where ``rvals`` are the values used for log(r), - ``csums`` are the corresponding log(C(r)) and ``poly`` are the line - coefficients (``[slope, intercept]``) + + If ``debug_data`` is True, the return value is instead a tuple containing + + - cd: the correlation dimension + - debug_data: tuple containing + - rvals: the values used for log(r) + - csums: the corresponding log(C(r)) + - poly: the line coefficients (``[slope, intercept]``) """ # TODO determine lag in units of time instead of number of datapoints data = np.asarray(data) @@ -2180,7 +2383,7 @@ def corr_dim( # TODO what are good values for r? # TODO do this for multiple values of emb_dim? 
if rvals is None: - sd = np.std(data, ddof=1) + sd = float(np.std(data, ddof=1)) rvals = logarithmic_r(0.1 * sd, 0.5 * sd, 1.03) orbit = delay_embedding(data, emb_dim, lag=lag) n = len(orbit) @@ -2211,7 +2414,7 @@ def corr_dim( csums = csums[nonzero] if len(csums) == 0: # all sums are zero => we cannot fit a line - poly = [np.nan, np.nan] + poly = np.array([np.nan, np.nan], dtype=np.float64) else: poly = poly_fit(np.log(rvals), np.log(csums), 1, fit=fit) if debug_plot: From fdee540b900bc5f8c5937aa34cd339361356aca5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christopher=20Sch=C3=B6lzel?= Date: Sat, 28 Jun 2025 22:04:35 +0200 Subject: [PATCH 15/36] fixes linting errors in detrend_data --- nolds/measures.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nolds/measures.py b/nolds/measures.py index bb76a18..bef181e 100644 --- a/nolds/measures.py +++ b/nolds/measures.py @@ -2424,7 +2424,9 @@ def corr_dim( return poly[0] -def detrend_data(data, order=1, fit="poly"): +def detrend_data( + data: np.ndarray[tuple[int], np.dtype[np.float64]], order: int = 1, fit: FittingMethod = "poly" +) -> np.ndarray[tuple[int], np.dtype[np.float64]]: """Removes a trend of given order from the data.""" # TODO also use this function in dfa xvals = np.arange(len(data)) From 2725813742e2fa96664ac1a203dbe17882f0832b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christopher=20Sch=C3=B6lzel?= Date: Sun, 29 Jun 2025 22:14:05 +0200 Subject: [PATCH 16/36] fixes remaining linting errors except for TODOs --- nolds/measures.py | 430 ++++++++++++++++++++++++++++------------------ 1 file changed, 260 insertions(+), 170 deletions(-) diff --git a/nolds/measures.py b/nolds/measures.py index bef181e..8109869 100644 --- a/nolds/measures.py +++ b/nolds/measures.py @@ -456,7 +456,14 @@ def nb_neighbors(lag_value: int) -> int: # normal line fitting poly = poly_fit(ks[fit_offset:], div_traj[fit_offset:], 1, fit=fit) if debug_plot: - plot_reg(ks[fit_offset:], div_traj[fit_offset:], poly, "k", 
"log(d(k))", fname=plot_file) + plot_reg( + ks[fit_offset:].astype(np.float64), + div_traj[fit_offset:], + poly, + "k", + "log(d(k))", + fname=plot_file, + ) le = poly[0] / tau if debug_data: return (le, (ks, div_traj, poly)) @@ -1416,7 +1423,7 @@ def plot_reg_tiled( def plot_reg_multiple( xvals: np.ndarray[tuple[int, int], np.dtype[np.float64]], yvals: np.ndarray[tuple[int, int], np.dtype[np.float64]], - polys: list[np.ndarray[tuple[int], np.dtype[np.float64]]] | None = None, + polys: np.ndarray[tuple[int, int], np.dtype[np.float64]] | None = None, x_label: str = "x", y_label: str = "y", data_labels: list[str] | None = None, @@ -1460,6 +1467,7 @@ def plot_reg_multiple( plt.savefig(fname) plt.close() + @overload def hurst_rs( data: np.typing.FloatArrayLike | np.typing.IntArrayLike, @@ -1700,6 +1708,7 @@ def hurst_rs( return (h, (np.log(nvals), np.log(rsvals), poly)) return h + @overload def mfhurst_b( data: np.typing.IntArrayLike | np.typing.FloatArrayLike, @@ -1712,6 +1721,7 @@ def mfhurst_b( plot_file: str | Path | None = None, ) -> np.ndarray[tuple[int], np.dtype[np.float64]]: ... + @overload def mfhurst_b( data: np.typing.IntArrayLike | np.typing.FloatArrayLike, @@ -1731,6 +1741,7 @@ def mfhurst_b( ], ]: ... + def mfhurst_b( data: np.typing.IntArrayLike | np.typing.FloatArrayLike, qvals: np.typing.FloatArrayLike | None = None, @@ -1894,7 +1905,7 @@ def hhcorr(d: int, q: int) -> float: plot_reg_multiple( np.array([xvals] * len(qvals), dtype=np.float64), np.array([yvals[:, qi] / qvals[qi] for qi in range(len(qvals))], dtype=np.float64), - [p / q for p, q in zip(polys, qvals)], + np.array([p / q for p, q in zip(polys, qvals)], dtype=np.float64), x_label="log(x)", y_label="$\\log(c_q(x)) / q$", data_labels=[f"q = {q}" for q in qvals], @@ -2051,7 +2062,9 @@ def mfhurst_dm( debug_plot: bool = False, debug_data: Literal[False] = False, plot_file: str | Path | None = None, -): ... 
+) -> tuple[ + np.ndarray[tuple[int], np.dtype[np.float64]], np.ndarray[tuple[int], np.dtype[np.float64]] +]: ... @overload @@ -2065,7 +2078,15 @@ def mfhurst_dm( debug_plot: bool = False, debug_data: Literal[True] = True, plot_file: str | Path | None = None, -): ... +) -> tuple[ + np.ndarray[tuple[int], np.dtype[np.float64]], + np.ndarray[tuple[int], np.dtype[np.float64]], + tuple[ + np.ndarray[tuple[int], np.dtype[np.float64]], + np.ndarray[tuple[int, int], np.dtype[np.float64]], + np.ndarray[tuple[int, int], np.dtype[np.float64]], + ], +]: ... def mfhurst_dm( @@ -2078,6 +2099,19 @@ def mfhurst_dm( debug_plot: bool = False, debug_data: bool = False, plot_file: str | Path | None = None, +) -> ( + tuple[ + np.ndarray[tuple[int], np.dtype[np.float64]], np.ndarray[tuple[int], np.dtype[np.float64]] + ] + | tuple[ + np.ndarray[tuple[int], np.dtype[np.float64]], + np.ndarray[tuple[int], np.dtype[np.float64]], + tuple[ + np.ndarray[tuple[int], np.dtype[np.float64]], + np.ndarray[tuple[int, int], np.dtype[np.float64]], + np.ndarray[tuple[int, int], np.dtype[np.float64]], + ], + ] ): """Calculates the Generalized Hurst Exponent H_q according to Di Mattheo and Aste. @@ -2143,13 +2177,17 @@ def mfhurst_dm( ``plt.show()`` Returns: - array of mH_q for every q given in ``qvals`` where mH_q is the mean of - all H_q calculated for different max distances in max_dists. + tuple containing + + - mH: array of mH_q for every q given in ``qvals`` where mH_q is the mean of + all H_q calculated for different max distances in max_dists. + - sH: same as mH, but calculating the standard deviation instead of the mean. 
If ``debug_data`` is True, the return value is instead a tuple containing - * mH_q: array of mean H_q for each q in ``qvals`` - * debug_data: a tuple of three arrays + - mH: array of mean H_q for each q in ``qvals`` + - sH: array of standard deviation of H_q for each q in ``qvals`` + - debug_data: a tuple of three arrays - xvals: the logarithm of the distances used for the height-height correlation - yvals: the logarithm of the height-height correlations for each @@ -2205,7 +2243,10 @@ def mfhurst_dm( dtype=np.float64, ).reshape(len(qvals), len(max_dists)) if debug_plot: - polys = [np.array(poly_fit(xvals, yvals[:, qi], 1)) / qvals[qi] for qi in range(len(qvals))] + polys = np.array( + [poly_fit(xvals, yvals[:, qi], 1) / qvals[qi] for qi in range(len(qvals))], + dtype=np.float64, + ) plot_reg_multiple( np.array([xvals] * len(qvals), dtype=np.float64), np.array([yvals[:, qi] / qvals[qi] for qi in range(len(qvals))], dtype=np.float64), @@ -2219,8 +2260,9 @@ def mfhurst_dm( mH = np.mean(H, axis=1) / qvals sH = np.std(H, axis=1) / qvals if debug_data: - return [mH, sH, (xvals, yvals, polys)] - return [mH, sH] + return (mH, sH, (xvals, yvals, polys)) + return (mH, sH) + @overload def corr_dim( @@ -2270,6 +2312,7 @@ def corr_dim( ], ]: ... 
+ def corr_dim( data: np.typing.IntArrayLike | np.typing.FloatArrayLike, emb_dim: int = 2, @@ -2354,6 +2397,8 @@ def corr_dim( Args: data: time series of data points emb_dim: embedding dimension + lag: the distance between two successive elements in the embedding vectors + (given in number of datapoints) rvals: list of values for to use for r (default: logarithmic_r(0.1 * std, 0.5 * std, 1.03)) dist: row-wise difference function @@ -2385,6 +2430,7 @@ def corr_dim( if rvals is None: sd = float(np.std(data, ddof=1)) rvals = logarithmic_r(0.1 * sd, 0.5 * sd, 1.03) + rvals = np.asarray(rvals, dtype=np.float64) orbit = delay_embedding(data, emb_dim, lag=lag) n = len(orbit) dists = np.zeros((len(orbit), len(orbit)), dtype=np.float64) @@ -2393,7 +2439,7 @@ def corr_dim( # NOTE: strictly speaking, [cd_1] does not specify to exclude self-matches # however, since both [cd_2] and [cd_3] specify to only compare i with j != i # or j > i respectively, it is safe to assume that this was an oversight in - # [cd_1] + # [cd_1] # noqa: ERA001 d = dist(orbit[i + 1 :], orbit[i]) dists[i + 1 :, i] = d # fill column i dists[i, i + 1 :] = d # fill row i @@ -2434,191 +2480,235 @@ def detrend_data( return data - np.polyval(trend, xvals) +@overload +def dfa( + data: np.typing.IntArrayLike | np.typing.FloatArrayLike, + nvals: np.typing.IntArrayLike | None = None, + *, + overlap: bool = True, + order: int = 1, + fit_trend: FittingMethod = "poly", + fit_exp: FittingMethod = "RANSAC", + debug_plot: bool = False, + debug_data: Literal[False] = False, + plot_file: str | Path | None = None, +) -> float: ... 
+ + +@overload def dfa( - data, - nvals=None, - overlap=True, - order=1, - fit_trend="poly", - fit_exp="RANSAC", - debug_plot=False, - debug_data=False, - plot_file=None, + data: np.typing.IntArrayLike | np.typing.FloatArrayLike, + nvals: np.typing.IntArrayLike | None = None, + *, + overlap: bool = True, + order: int = 1, + fit_trend: FittingMethod = "poly", + fit_exp: FittingMethod = "RANSAC", + debug_plot: bool = False, + debug_data: Literal[True] = True, + plot_file: str | Path | None = None, +) -> tuple[ + float, + tuple[ + np.ndarray[tuple[int], np.dtype[np.float64]], + np.ndarray[tuple[int], np.dtype[np.float64]], + np.ndarray[tuple[int], np.dtype[np.float64]], + ], +]: ... + + +def dfa( # noqa: C901, PLR0912 + data: np.typing.IntArrayLike | np.typing.FloatArrayLike, + nvals: np.typing.IntArrayLike | None = None, + *, + overlap: bool = True, + order: int = 1, + fit_trend: FittingMethod = "poly", + fit_exp: FittingMethod = "RANSAC", + debug_plot: bool = False, + debug_data: bool = False, + plot_file: str | Path | None = None, +) -> ( + float + | tuple[ + float, + tuple[ + np.ndarray[tuple[int], np.dtype[np.float64]], + np.ndarray[tuple[int], np.dtype[np.float64]], + np.ndarray[tuple[int], np.dtype[np.float64]], + ], + ] ): """Performs a detrended fluctuation analysis (DFA) on the given data. Recommendations for parameter settings by Hardstone et al.: - * nvals should be equally spaced on a logarithmic scale so that each window - scale hase the same weight - * min(nvals) < 4 does not make much sense as fitting a polynomial (even if - it is only of order 1) to 3 or less data points is very prone to errors. 
- * max(nvals) > len(data) / 10 does not make much sense as we will then have - less than 10 windows to calculate the average fluctuation - * use overlap=True to obtain more windows and therefore better statistics - (at an increased computational cost) + * nvals should be equally spaced on a logarithmic scale so that each window + scale has the same weight + * min(nvals) < 4 does not make much sense as fitting a polynomial (even if + it is only of order 1) to 3 or fewer data points is very prone to errors. + * max(nvals) > len(data) / 10 does not make much sense as we will then have + fewer than 10 windows to calculate the average fluctuation + * use overlap=True to obtain more windows and therefore better statistics + (at an increased computational cost) Explanation of DFA: - Detrended fluctuation analysis, much like the Hurst exponent, is used to - find long-term statistical dependencies in time series. However, while the - Hurst exponent will indicate long-term correlations for any non-stationary - process (i.e. a stochastic process whose probability distribution changes - when shifted in time, such as a random walk whose mean changes over time), - DFA was designed to distinguish between correlations that are purely an - artifact of non-stationarity and those that show inherent long-term - behavior of the studied system. - - Mathematically, the long-term correlations that we are interested in can - be characterized using the autocorrelation function C(s). For a time series - (x_i) with i = 1, ..., N it is defined as follows: - - C(s) = 1/(N-s) * (y_1 * y_1+s + y_2 * y_2+s + ... y_(N-s) * y_N) - - with y_i = x_i - mean(x). If there are no correlations at all, C(s) would - be zero for s > 0. For short-range correlations, C(s) will decline - exponentially, but for long-term correlations the decline follows a power - law of the form C(s) ~ s^(-gamma) instead with 0 < gamma < 1. 
- - Due to noise and underlying trends, calculating C(s) directly is usually not - feasible. The main idea of DFA is therefore to remove trends up to a given - order from the input data and analyze the remaining fluctuations. Trends - in this sense are smooth signals with monotonous or slowly oscillating - behavior that are caused by external effects and not the dynamical system - under study. - - To get a hold of these trends, the first step is to calculate the "profile" - of our time series as the cumulative sum of deviations from the mean, - effectively integrating our data. This both smoothes out measurement noise - and makes it easier to distinguish the fractal properties of bounded time - series (i.e. time series whose values cannot grow or shrink beyond certain - bounds such as most biological or physical signals) by applying random walk - theory (see [dfa_3]_ and [dfa_4]_). - - y_i = x_1 - mean(x) + x_2 - mean(x) + ... + x_i - mean(x). - - After that, we split Y(i) into (usually non-overlapping) windows of length - n to calculate local trends at this given scale. The ith window of this - size has the form - - W_(n,i) = [y_i, y_(i+1), y_(i+2), ... y_(i+n-1)] - - The local trends are then removed for each window separately by fitting a - polynomial p_(n,i) to the window W_(n,i) and then calculating - W'_(n,i) = W_(n,i) - p_(n,i) (element-wise subtraction). - - This leaves us with the deviations from the trend - the "fluctuations" - - that we are interested in. To quantify them, we take the root mean square - of these fluctuations. It is important to note that we have to sum up all - individual fluctuations across all windows and divide by the total number - of fluctuations here before finally taking the root as last step. Some - implementations apply another root per window, which skews the result. - - The resulting fluctuation F(n) is then only dependent on the window size n, - the scale at which we observe our data. 
It behaves similar to the - autocorrelation function in that it follows a power-law for long-term - correlations: - - F(n) ~ n^alpha - - Where alpha is the Hurst parameter, which we can obtain from fitting a line - into the plot of log(n) versus log(F(n)) and taking the slope. - - The result can be interpreted as follows: For alpha < 1 the underlying - process is stationary and can be modelled as fractional Gaussian noise with - H = alpha. This means for alpha = 0.5 we have no long-term correlation or - "memory", for 0.5 < alpha < 1 we have positive long-term correlations and - for alpha < 0.5 the long-term correlations are negative. - - For alpha > 1 the underlying process is non-stationary and can be modeled - as fractional Brownian motion with H = alpha - 1. + Detrended fluctuation analysis, much like the Hurst exponent, is used to + find long-term statistical dependencies in time series. However, while the + Hurst exponent will indicate long-term correlations for any non-stationary + process (i.e. a stochastic process whose probability distribution changes + when shifted in time, such as a random walk whose mean changes over time), + DFA was designed to distinguish between correlations that are purely an + artifact of non-stationarity and those that show inherent long-term + behavior of the studied system. + + Mathematically, the long-term correlations that we are interested in can + be characterized using the autocorrelation function C(s). For a time series + (x_i) with i = 1, ..., N it is defined as follows: + + C(s) = 1/(N-s) * (y_1 * y_1+s + y_2 * y_2+s + ... y_(N-s) * y_N) + + with y_i = x_i - mean(x). If there are no correlations at all, C(s) would + be zero for s > 0. For short-range correlations, C(s) will decline + exponentially, but for long-term correlations the decline follows a power + law of the form C(s) ~ s^(-gamma) instead with 0 < gamma < 1. + + Due to noise and underlying trends, calculating C(s) directly is usually not + feasible. 
The main idea of DFA is therefore to remove trends up to a given + order from the input data and analyze the remaining fluctuations. Trends + in this sense are smooth signals with monotonous or slowly oscillating + behavior that are caused by external effects and not the dynamical system + under study. + + To get a hold of these trends, the first step is to calculate the "profile" + of our time series as the cumulative sum of deviations from the mean, + effectively integrating our data. This both smoothes out measurement noise + and makes it easier to distinguish the fractal properties of bounded time + series (i.e. time series whose values cannot grow or shrink beyond certain + bounds such as most biological or physical signals) by applying random walk + theory (see [dfa_3]_ and [dfa_4]_). + + y_i = x_1 - mean(x) + x_2 - mean(x) + ... + x_i - mean(x). + + After that, we split Y(i) into (usually non-overlapping) windows of length + n to calculate local trends at this given scale. The ith window of this + size has the form + + W_(n,i) = [y_i, y_(i+1), y_(i+2), ... y_(i+n-1)] + + The local trends are then removed for each window separately by fitting a + polynomial p_(n,i) to the window W_(n,i) and then calculating + W'_(n,i) = W_(n,i) - p_(n,i) (element-wise subtraction). + + This leaves us with the deviations from the trend - the "fluctuations" - + that we are interested in. To quantify them, we take the root mean square + of these fluctuations. It is important to note that we have to sum up all + individual fluctuations across all windows and divide by the total number + of fluctuations here before finally taking the root as last step. Some + implementations apply another root per window, which skews the result. + + The resulting fluctuation F(n) is then only dependent on the window size n, + the scale at which we observe our data. 
It behaves similar to the + autocorrelation function in that it follows a power-law for long-term + correlations: + + F(n) ~ n^alpha + + Where alpha is the Hurst parameter, which we can obtain from fitting a line + into the plot of log(n) versus log(F(n)) and taking the slope. + + The result can be interpreted as follows: For alpha < 1 the underlying + process is stationary and can be modelled as fractional Gaussian noise with + H = alpha. This means for alpha = 0.5 we have no long-term correlation or + "memory", for 0.5 < alpha < 1 we have positive long-term correlations and + for alpha < 0.5 the long-term correlations are negative. + + For alpha > 1 the underlying process is non-stationary and can be modeled + as fractional Brownian motion with H = alpha - 1. References: - .. [dfa_1] C.-K. Peng, S. V. Buldyrev, S. Havlin, M. Simons, - H. E. Stanley, and A. L. Goldberger, “Mosaic organization of - DNA nucleotides,” Physical Review E, vol. 49, no. 2, 1994. - .. [dfa_2] J. W. Kantelhardt, E. Koscielny-Bunde, H. H. A. Rego, S. - Havlin, and A. Bunde, “Detecting long-range correlations with - detrended fluctuation analysis,” Physica A: Statistical - Mechanics and its Applications, vol. 295, no. 3–4, pp. 441–454, - Jun. 2001, doi: 10.1016/S0378-4371(01)00144-3. - .. [dfa_3] C. Peng, J. M. Hausdorff, and A. L. Goldberger, “Fractal - mechanisms in neuronal control: human heartbeat and gait - dynamics in health and disease,” in Self-Organized Biological - Dynamics and Nonlinear Control, 1st ed., J. Walleczek, Ed., - Cambridge University Press, 2000, pp. 66–96. - doi: 10.1017/CBO9780511535338.006. - .. [dfa_4] A. Bashan, R. Bartsch, J. W. Kantelhardt, and S. Havlin, - “Comparison of detrending methods for fluctuation analysis,” - Physica A: Statistical Mechanics and its Applications, vol. 387, - no. 21, pp. 5080–5090, Sep. 2008, - doi: 10.1016/j.physa.2008.04.023. - .. [dfa_5] R. Hardstone, S.-S. Poil, G. Schiavone, R. Jansen, - V. V. Nikulin, H. D. Mansvelder, and K. 
Linkenkaer-Hansen, - “Detrended fluctuation analysis: A scale-free view on neuronal - oscillations,” Frontiers in Physiology, vol. 30, 2012. + .. [dfa_1] C.-K. Peng, S. V. Buldyrev, S. Havlin, M. Simons, + H. E. Stanley, and A. L. Goldberger, “Mosaic organization of + DNA nucleotides,” Physical Review E, vol. 49, no. 2, 1994. + .. [dfa_2] J. W. Kantelhardt, E. Koscielny-Bunde, H. H. A. Rego, S. + Havlin, and A. Bunde, “Detecting long-range correlations with + detrended fluctuation analysis,” Physica A: Statistical + Mechanics and its Applications, vol. 295, no. 3–4, pp. 441–454, + Jun. 2001, doi: 10.1016/S0378-4371(01)00144-3. + .. [dfa_3] C. Peng, J. M. Hausdorff, and A. L. Goldberger, “Fractal + mechanisms in neuronal control: human heartbeat and gait + dynamics in health and disease,” in Self-Organized Biological + Dynamics and Nonlinear Control, 1st ed., J. Walleczek, Ed., + Cambridge University Press, 2000, pp. 66–96. + doi: 10.1017/CBO9780511535338.006. + .. [dfa_4] A. Bashan, R. Bartsch, J. W. Kantelhardt, and S. Havlin, + “Comparison of detrending methods for fluctuation analysis,” + Physica A: Statistical Mechanics and its Applications, vol. 387, + no. 21, pp. 5080–5090, Sep. 2008, + doi: 10.1016/j.physa.2008.04.023. + .. [dfa_5] R. Hardstone, S.-S. Poil, G. Schiavone, R. Jansen, + V. V. Nikulin, H. D. Mansvelder, and K. Linkenkaer-Hansen, + “Detrended fluctuation analysis: A scale-free view on neuronal + oscillations,” Frontiers in Physiology, vol. 30, 2012. Reference code: - .. [dfa_a] Peter Jurica, "Introduction to MDFA in Python", - url: http://bsp.brain.riken.jp/~juricap/mdfa/mdfaintro.html - .. [dfa_b] JE Mietus, "dfa", - url: https://www.physionet.org/physiotools/dfa/dfa-1.htm - .. [dfa_c] "DFA" function in R package "fractal" + .. [dfa_a] Peter Jurica, "Introduction to MDFA in Python", + url: http://bsp.brain.riken.jp/~juricap/mdfa/mdfaintro.html + .. [dfa_b] JE Mietus, "dfa", + url: https://www.physionet.org/physiotools/dfa/dfa-1.htm + .. 
[dfa_c] "DFA" function in R package "fractal" Args: - data (array-like of float): - time series - Kwargs: - nvals (iterable of int): - subseries sizes at which to calculate fluctuation - (default: logarithmic_n(4, 0.1*len(data), 1.2)) - overlap (boolean): - if True, the windows W_(n,i) will have a 50% overlap, - otherwise non-overlapping windows will be used - order (int): - (polynomial) order of trend to remove - fit_trend (str): - the fitting method to use for fitting the trends, either 'poly' - for normal least squares polynomial fitting or 'RANSAC' for - RANSAC-fitting which is more robust to outliers but also tends to - lead to unstable results - fit_exp (str): - the fitting method to use for the line fit, either 'poly' for normal - least squares polynomial fitting or 'RANSAC' for RANSAC-fitting which - is more robust to outliers - debug_plot (boolean): - if True, a simple plot of the final line-fitting step will be shown - debug_data (boolean): - if True, debugging data will be returned alongside the result - plot_file (str): - if debug_plot is True and plot_file is not None, the plot will be saved - under the given file name instead of directly showing it through - ``plt.show()`` + data: time series + nvals: subseries sizes at which to calculate fluctuation + (default: logarithmic_n(4, 0.1*len(data), 1.2)) + overlap: if True, the windows W_(n,i) will have a 50% overlap, + otherwise non-overlapping windows will be used + order: (polynomial) order of trend to remove + fit_trend: the fitting method to use for fitting the trends, either 'poly' + for normal least squares polynomial fitting or 'RANSAC' for + RANSAC-fitting which is more robust to outliers but also tends to + lead to unstable results + fit_exp: the fitting method to use for the line fit, either 'poly' for normal + least squares polynomial fitting or 'RANSAC' for RANSAC-fitting which + is more robust to outliers + debug_plot: if True, a simple plot of the final line-fitting step will be shown + 
debug_data: if True, debugging data will be returned alongside the result + plot_file: if debug_plot is True and plot_file is not None, the plot will be saved + under the given file name instead of directly showing it through + ``plt.show()`` Returns: - float: the estimate alpha for the Hurst parameter (alpha < 1: stationary process similar to fractional Gaussian noise with H = alpha, alpha > 1: non-stationary process similar to fractional Brownian motion with H = alpha - 1) - (1d-vector, 1d-vector, list): - only present if debug_data is True: debug data of the form - ``(nvals, fluctuations, poly)`` where ``nvals`` are the values used for - log(n), ``fluctuations`` are the corresponding log(std(X,n)) and ``poly`` - are the line coefficients (``[slope, intercept]``) + + If ``debug_data`` is set to ``True``, the return value is instead a tuple containing + + - H: Hurst parameter + - nvals: the values used for log(n) + - fluctuations: the corresponding log(std(X,n)) + - poly: the line coefficients (``[slope, intercept]``) """ data = np.asarray(data) total_N = len(data) if nvals is None: - if total_N > 70: - nvals = logarithmic_n(4, 0.1 * total_N, 1.2) - elif total_N > 10: + min_n_for_log_scale = 70 + min_n = 10 + if total_N > min_n_for_log_scale: + nvals = logarithmic_n(4, np.floor(0.1 * total_N), 1.2) + elif total_N > min_n: nvals = [4, 5, 6, 7, 8, 9] else: nvals = [total_N - 2, total_N - 1] msg = "choosing nvals = {} , DFA with less than ten data points is extremely unreliable" warnings.warn(msg.format(nvals), RuntimeWarning, stacklevel=2) - if len(nvals) < 2: + nvals = np.asarray(nvals, dtype=np.int32) + min_number_of_nvals = 2 + min_nval = 2 + if nvals.shape[0] < min_number_of_nvals: msg = "at least two nvals are needed" raise ValueError(msg) - if np.min(nvals) < 2: + if np.min(nvals) < min_nval: msg = "nvals must be at least two" raise ValueError(msg) if np.max(nvals) >= total_N: @@ -2658,7 +2748,7 @@ def dfa( fluctuations = fluctuations[nonzero] if 
len(fluctuations) == 0: # all fluctuations are zero => we cannot fit a line - poly = [np.nan, np.nan] + poly = np.array([np.nan, np.nan], dtype=np.float64) else: poly = poly_fit(np.log(nvals), np.log(fluctuations), 1, fit=fit_exp) if debug_plot: From b870338bf61d22fbbf4930ec74aa02a94d76e33d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christopher=20Sch=C3=B6lzel?= Date: Sun, 29 Jun 2025 22:17:37 +0200 Subject: [PATCH 17/36] fixes some more linting errors that I missed --- nolds/measures.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/nolds/measures.py b/nolds/measures.py index 8109869..4a5f6cd 100644 --- a/nolds/measures.py +++ b/nolds/measures.py @@ -4,8 +4,10 @@ import math import warnings -from pathlib import Path -from typing import Callable, Literal, TypeVar, cast, overload +from typing import TYPE_CHECKING, Callable, Literal, TypeVar, cast, overload + +if TYPE_CHECKING: + from pathlib import Path import numpy as np @@ -2517,7 +2519,7 @@ def dfa( ]: ... -def dfa( # noqa: C901, PLR0912 +def dfa( # noqa: C901, PLR0912, PLR0915 data: np.typing.IntArrayLike | np.typing.FloatArrayLike, nvals: np.typing.IntArrayLike | None = None, *, @@ -2719,7 +2721,7 @@ def dfa( # noqa: C901, PLR0912 walk = np.cumsum(data - np.mean(data)) fluctuations = [] for n in nvals: - assert n >= 2 + assert n >= min_nval # subdivide data into chunks of size n if overlap: # step size n/2 instead of n From 16c1d02e57d1927d726693f50f1cca6969b3f5e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christopher=20Sch=C3=B6lzel?= Date: Sun, 27 Jul 2025 21:41:02 +0200 Subject: [PATCH 18/36] bugfix: dists can only be int, not float --- nolds/measures.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/nolds/measures.py b/nolds/measures.py index 4a5f6cd..45b23a7 100644 --- a/nolds/measures.py +++ b/nolds/measures.py @@ -1715,7 +1715,7 @@ def hurst_rs( def mfhurst_b( data: np.typing.IntArrayLike | np.typing.FloatArrayLike, qvals: np.typing.FloatArrayLike | 
None = None, - dists: np.typing.FloatArrayLike | None = None, + dists: np.typing.IntArrayLike | None = None, fit: FittingMethod = "poly", *, debug_plot: bool = False, @@ -1728,7 +1728,7 @@ def mfhurst_b( def mfhurst_b( data: np.typing.IntArrayLike | np.typing.FloatArrayLike, qvals: np.typing.FloatArrayLike | None = None, - dists: np.typing.FloatArrayLike | None = None, + dists: np.typing.IntArrayLike | None = None, fit: FittingMethod = "poly", *, debug_plot: bool = False, @@ -1747,7 +1747,7 @@ def mfhurst_b( def mfhurst_b( data: np.typing.IntArrayLike | np.typing.FloatArrayLike, qvals: np.typing.FloatArrayLike | None = None, - dists: np.typing.FloatArrayLike | None = None, + dists: np.typing.IntArrayLike | None = None, fit: FittingMethod = "poly", *, debug_plot: bool = False, @@ -1877,7 +1877,7 @@ def mfhurst_b( qvals = np.asarray(qvals, dtype=np.float64) if dists is None: dists = logarithmic_n(1, np.ceil(max(20, 0.02 * len(data))), 1.5) - dists = np.asarray(dists, dtype=np.float64) + dists = np.asarray(dists, dtype=np.int32) min_reliable_n = 60 if len(data) < min_reliable_n: warnings.warn( @@ -1885,7 +1885,7 @@ def mfhurst_b( stacklevel=2, ) - def hhcorr(d: int, q: int) -> float: + def hhcorr(d: int, q: float) -> float: """Calculates the height-height correlation for a given distance d and q.""" diffs = np.abs(data[:-d] - data[d:]) diffs = diffs[np.where(diffs > 0)] From 6607a0830d0f749e3b308450de50f2cbcdcee491 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christopher=20Sch=C3=B6lzel?= Date: Sat, 9 Aug 2025 19:34:58 +0200 Subject: [PATCH 19/36] introduces type aliases for extra documentation --- nolds/measures.py | 402 ++++++++++++++++++++++++++-------------------- 1 file changed, 227 insertions(+), 175 deletions(-) diff --git a/nolds/measures.py b/nolds/measures.py index 45b23a7..b29a205 100644 --- a/nolds/measures.py +++ b/nolds/measures.py @@ -4,26 +4,46 @@ import math import warnings -from typing import TYPE_CHECKING, Callable, Literal, TypeVar, cast, overload - 
-if TYPE_CHECKING: - from pathlib import Path +from typing import ( + TYPE_CHECKING, + Callable, + Literal, + TypeAlias, + TypeVar, + cast, + overload, +) import numpy as np -D = TypeVar("D", bound=np.integer | np.floating) - +if TYPE_CHECKING: + from pathlib import Path -def rowwise_chebyshev( - x: np.ndarray[tuple[int, int], np.dtype[D]], y: np.ndarray[tuple[int], np.dtype[D]] -) -> np.ndarray[tuple[int], np.dtype[D]]: + from numpy.typing import ArrayLike + + D = TypeVar("D", bound=np.integer | np.floating) + # Array type definitions + # NOTE: We define aliases here to save space and to make it easy to update + # the types when numpy settles on a best practice for annotating array dimensions. + IntArray1D: TypeAlias = np.ndarray[tuple[int], np.dtype[np.int32]] + FloatArray1D: TypeAlias = np.ndarray[tuple[int], np.dtype[np.float64]] + FloatArray2D: TypeAlias = np.ndarray[tuple[int, int], np.dtype[np.float64]] + NumberArray1D: TypeAlias = np.ndarray[tuple[int], np.dtype[D]] + NumberArray2D: TypeAlias = np.ndarray[tuple[int, int], np.dtype[D]] + # Define more specific aliases for input data + # NOTE: These don't change anything in type checking, but the type name servers as + # additional documentation for users. 
+ IntArrayLike1D: TypeAlias = ArrayLike # 1D structure containing int values + FloatArrayLike1D: TypeAlias = ArrayLike # 1D structure containing float values + NumberArrayLike1D: TypeAlias = ArrayLike # 1D structure containing number values + + +def rowwise_chebyshev(x: NumberArray2D, y: NumberArray1D) -> NumberArray1D: """Returns the Chebyshev distances between each row of matrix x and the reference row y.""" return np.max(np.abs(x - y), axis=1) -def rowwise_euclidean( - x: np.ndarray[tuple[int, int], np.dtype[D]], y: np.ndarray[tuple[int], np.dtype[D]] -) -> np.ndarray[tuple[int], np.dtype[D]]: +def rowwise_euclidean(x: NumberArray2D, y: NumberArray1D) -> NumberArray1D: """Returns the Euclidean distances between each row of matrix x and the reference row y.""" return np.sqrt(np.sum((x - y) ** 2, axis=1)) @@ -32,11 +52,11 @@ def rowwise_euclidean( def poly_fit( - x: np.ndarray[tuple[int], np.dtype[D]], - y: np.ndarray[tuple[int], np.dtype[D]], + x: NumberArray1D, + y: NumberArray1D, degree: int, fit: FittingMethod = "RANSAC", -) -> np.ndarray[tuple[int], np.dtype[np.float64]]: +) -> FloatArray1D: """Fits a polynomial of the given degree to the data. This currently supports two fittting algorithms. @@ -96,8 +116,8 @@ def poly_fit( def delay_embedding( - data: np.typing.FloatArrayLike | np.typing.IntArrayLike, emb_dim: int, lag: int = 1 -) -> np.ndarray[tuple[int, int], np.dtype[np.float64]]: + data: NumberArrayLike1D, emb_dim: int, lag: int = 1 +) -> FloatArray2D: """Perform a time-delay embedding of a time series. 
Args: @@ -153,7 +173,7 @@ def lyap_r_len(emb_dim: int, lag: int, trajectory_len: int, min_tsep: int) -> in @overload def lyap_r( - data: np.typing.FloatArrayLike | np.typing.IntArrayLike, + data: NumberArrayLike1D, emb_dim: int = 10, *, lag: int | None = None, @@ -171,7 +191,7 @@ def lyap_r( @overload def lyap_r( - data: np.typing.FloatArrayLike | np.typing.IntArrayLike, + data: NumberArrayLike1D, emb_dim: int = 10, *, lag: int | None = None, @@ -187,15 +207,15 @@ def lyap_r( ) -> tuple[ np.float64, tuple[ - np.ndarray[tuple[int], np.dtype[np.int32]], - np.ndarray[tuple[int], np.dtype[np.float64]], - np.ndarray[tuple[int], np.dtype[np.float64]], + IntArray1D, + FloatArray1D, + FloatArray1D, ], ]: ... def lyap_r( # noqa: C901, PLR0912, PLR0915 - data: np.typing.FloatArrayLike | np.typing.IntArrayLike, + data: NumberArrayLike1D, emb_dim: int = 10, *, lag: int | None = None, @@ -213,9 +233,9 @@ def lyap_r( # noqa: C901, PLR0912, PLR0915 | tuple[ float, tuple[ - np.ndarray[tuple[int], np.dtype[np.int32]], - np.ndarray[tuple[int], np.dtype[np.float64]], - np.ndarray[tuple[int], np.dtype[np.float64]], + IntArray1D, + FloatArray1D, + FloatArray1D, ], ] ): @@ -417,7 +437,9 @@ def nb_neighbors(lag_value: int) -> int: raise ValueError(msg.format(-ntraj + 1)) if ntraj < min_traj: # not enough data points => there are rows where all values are inf - assert np.any(np.all(np.isinf(dists[:ntraj, :ntraj]), axis=1)), "no inf rows found" + assert np.any(np.all(np.isinf(dists[:ntraj, :ntraj]), axis=1)), ( + "no inf rows found" + ) msg = ( "Not enough data points. 
At least {} trajectories are required " "to find a valid neighbor for each orbit vector with min_tsep={} " @@ -498,7 +520,7 @@ def lyap_e_len(emb_dim: int, matrix_dim: int, min_tsep: int, min_nb: int) -> int @overload def lyap_e( - data: np.typing.FloatArrayLike | np.typing.IntArrayLike, + data: NumberArrayLike1D, *, emb_dim: int = 10, matrix_dim: int = 4, @@ -508,12 +530,12 @@ def lyap_e( debug_plot: bool = False, debug_data: Literal[False] = False, plot_file: str | Path | None = None, -) -> np.ndarray[tuple[int], np.dtype[np.float64]]: ... +) -> FloatArray1D: ... @overload def lyap_e( - data: np.typing.FloatArrayLike | np.typing.IntArrayLike, + data: NumberArrayLike1D, *, emb_dim: int = 10, matrix_dim: int = 4, @@ -524,12 +546,13 @@ def lyap_e( debug_data: Literal[True] = True, plot_file: str | Path | None = None, ) -> tuple[ - np.ndarray[tuple[int], np.dtype[np.float64]], np.ndarray[tuple[int, int], np.dtype[np.float64]] + FloatArray1D, + FloatArray2D, ]: ... def lyap_e( # noqa: C901, PLR0915 - data: np.typing.FloatArrayLike | np.typing.IntArrayLike, + data: NumberArrayLike1D, *, emb_dim: int = 10, matrix_dim: int = 4, @@ -540,10 +563,10 @@ def lyap_e( # noqa: C901, PLR0915 debug_data: bool = False, plot_file: str | Path | None = None, ) -> ( - np.ndarray[tuple[int], np.dtype[np.float64]] + FloatArray1D | tuple[ - np.ndarray[tuple[int], np.dtype[np.float64]], - np.ndarray[tuple[int, int], np.dtype[np.float64]], + FloatArray1D, + FloatArray2D, ] ): r"""Estimates the Lyapunov exponents using the algorithm of Eckmann et al. [le_1]_. 
@@ -809,7 +832,7 @@ def lyap_e( # noqa: C901, PLR0915 def plot_dists( - dists: list[np.ndarray[tuple[int], np.dtype[np.float64]]], + dists: list[FloatArray1D], tolerance: float, m: int, title: str | None = None, @@ -836,7 +859,9 @@ def plot_dists( std = np.std(dists_full, ddof=1) rng = (0.0, float(mean + std * nstd)) colors = ["green", "blue"] - for i, (h, bins) in enumerate([np.histogram(dat, bins=nbins, range=rng) for dat in dists]): + for i, (h, bins) in enumerate( + [np.histogram(dat, bins=nbins, range=rng) for dat in dists] + ): bw = bins[1] - bins[0] plt.bar(bins[:-1], h, bw, label=f"m={m + i:d}", color=colors[i], alpha=0.5) plt.axvline(tolerance, color="red") @@ -855,17 +880,17 @@ def plot_dists( @overload def sampen( - data: np.typing.FloatArrayLike | np.typing.IntArrayLike, + data: NumberArrayLike1D, *, emb_dim: int = 2, tolerance: float | None = None, lag: int = 1, dist: Callable[ [ - np.ndarray[tuple[int, int], np.dtype[np.float64]], - np.ndarray[tuple[int], np.dtype[np.float64]], + FloatArray2D, + FloatArray1D, ], - np.ndarray[tuple[int], np.dtype[np.float64]], + FloatArray1D, ] = rowwise_chebyshev, closed: bool = False, debug_plot: bool = False, @@ -876,17 +901,17 @@ def sampen( @overload def sampen( - data: np.typing.FloatArrayLike | np.typing.IntArrayLike, + data: NumberArrayLike1D, *, emb_dim: int = 2, tolerance: float | None = None, lag: int = 1, dist: Callable[ [ - np.ndarray[tuple[int, int], np.dtype[np.float64]], - np.ndarray[tuple[int], np.dtype[np.float64]], + FloatArray2D, + FloatArray1D, ], - np.ndarray[tuple[int], np.dtype[np.float64]], + FloatArray1D, ] = rowwise_chebyshev, closed: bool = False, debug_plot: bool = False, @@ -895,22 +920,22 @@ def sampen( ) -> tuple[ float, list[float], - list[np.ndarray[tuple[int], np.dtype[np.float64]]], + list[FloatArray1D], ]: ... 
def sampen( # noqa: C901, PLR0912 - data: np.typing.FloatArrayLike | np.typing.IntArrayLike, + data: NumberArrayLike1D, *, emb_dim: int = 2, tolerance: float | None = None, lag: int = 1, dist: Callable[ [ - np.ndarray[tuple[int, int], np.dtype[np.float64]], - np.ndarray[tuple[int], np.dtype[np.float64]], + FloatArray2D, + FloatArray1D, ], - np.ndarray[tuple[int], np.dtype[np.float64]], + FloatArray1D, ] = rowwise_chebyshev, closed: bool = False, debug_plot: bool = False, @@ -921,7 +946,7 @@ def sampen( # noqa: C901, PLR0912 | tuple[ float, list[float], - list[np.ndarray[tuple[int], np.dtype[np.float64]]], + list[FloatArray1D], ] ): """Computes the sample entropy of the given data. @@ -1063,7 +1088,11 @@ def sampen( # noqa: C901, PLR0912 saen = np.inf if debug_plot: plot_dists( - plot_data, cast("float", tolerance), m, title=f"sampEn = {saen:.3f}", fname=plot_file + plot_data, + cast("float", tolerance), + m, + title=f"sampEn = {saen:.3f}", + fname=plot_file, ) if debug_data: return (saen, counts, plot_data) @@ -1124,9 +1153,7 @@ def logarithmic_n(min_n: int, max_n: int, factor: float) -> list[int]: return ns -def logmid_n( - max_n: int, ratio: float = 1 / 4.0, nsteps: int = 15 -) -> np.ndarray[tuple[int], np.dtype[np.int32]]: +def logmid_n(max_n: int, ratio: float = 1 / 4.0, nsteps: int = 15) -> IntArray1D: """Creates an array of equidistant values in the "middle" of [0, max_n] on a logarithmic scale. If max_n is very small and/or nsteps is very large, this may lead to @@ -1181,7 +1208,7 @@ def logarithmic_r(min_r: float, max_r: float, factor: float) -> list[float]: return [min_r * (factor**i) for i in range(max_i + 1)] -def expected_rs(n: int) -> float: +def expected_rs(n: np.integer) -> float: """Approximates the expected (R/S)_n for white noise for a given n. This is used as a correction factor in the function hurst_rs. 
It uses the @@ -1204,7 +1231,7 @@ def expected_rs(n: int) -> float: return front * middle * back -def expected_h(nvals: np.typing.IntArrayLike, fit: FittingMethod = "RANSAC") -> float: +def expected_h(nvals: IntArrayLike1D, fit: FittingMethod = "RANSAC") -> float: """Uses expected_rs to calculate the expected value for the Hurst exponent h. Args: @@ -1222,9 +1249,7 @@ def expected_h(nvals: np.typing.IntArrayLike, fit: FittingMethod = "RANSAC") -> return poly[0] -def rs( - data: np.ndarray[tuple[int], np.dtype[np.float64]], n: int, *, unbiased: bool = True -) -> float: +def rs(data: FloatArray1D, n: np.integer, *, unbiased: bool = True) -> float: """Calculates an individual R/S value in the rescaled range approach for a given n. Note: This is just a helper function for hurst_rs and should not be called @@ -1271,9 +1296,11 @@ def rs( def plot_histogram_matrix( - data: np.ndarray[tuple[int, int], np.dtype[np.float64]], + data: FloatArray2D, name: str, - bin_range: Literal["absmax", "1sigma", "2sigma", "3sigma", "4sigma", "5sigma"] = "3sigma", + bin_range: Literal[ + "absmax", "1sigma", "2sigma", "3sigma", "4sigma", "5sigma" + ] = "3sigma", fname: str | Path | None = None, ) -> None: """Plot a quadratic matrix of histograms. @@ -1321,9 +1348,9 @@ def plot_histogram_matrix( def plot_reg( - xvals: np.ndarray[tuple[int], np.dtype[np.float64]], - yvals: np.ndarray[tuple[int], np.dtype[np.float64]], - poly: np.ndarray[tuple[int], np.dtype[np.float64]] | None = None, + xvals: FloatArray1D, + yvals: FloatArray1D, + poly: FloatArray1D | None = None, x_label: str = "x", y_label: str = "y", data_label: str = "data", @@ -1363,11 +1390,11 @@ def plot_reg( plt.close() -# TODO this is not used anywhere. Do we still need it? +# TODO: this is not used anywhere. Do we still need it? 
def plot_reg_tiled( - xvals: np.ndarray[tuple[int, int], np.dtype[np.float64]], - yvals: np.ndarray[tuple[int, int], np.dtype[np.float64]], - polys: list[np.ndarray[tuple[int], np.dtype[np.float64]]] | None = None, + xvals: FloatArray2D, + yvals: FloatArray2D, + polys: list[FloatArray1D] | None = None, x_label: str = "x", y_label: str = "y", data_labels: list[str] | None = None, @@ -1410,7 +1437,9 @@ def plot_reg_tiled( plt.subplot(int(np.ceil(len(xvals) / columns)), columns, i + 1) plt.plot(xvals[i], yvals[i], "bo", label=data_labels[i]) if polys is not None: - plt.plot(xvals[i], np.polyval(polys[i], xvals[i]), "r-", label=reg_labels[i]) + plt.plot( + xvals[i], np.polyval(polys[i], xvals[i]), "r-", label=reg_labels[i] + ) plt.xlabel(x_label) plt.ylabel(y_label) plt.ylim(means[i] - max_span / 2, means[i] + max_span / 2) @@ -1423,9 +1452,9 @@ def plot_reg_tiled( def plot_reg_multiple( - xvals: np.ndarray[tuple[int, int], np.dtype[np.float64]], - yvals: np.ndarray[tuple[int, int], np.dtype[np.float64]], - polys: np.ndarray[tuple[int, int], np.dtype[np.float64]] | None = None, + xvals: FloatArray2D, + yvals: FloatArray2D, + polys: FloatArray2D | None = None, x_label: str = "x", y_label: str = "y", data_labels: list[str] | None = None, @@ -1472,8 +1501,8 @@ def plot_reg_multiple( @overload def hurst_rs( - data: np.typing.FloatArrayLike | np.typing.IntArrayLike, - nvals: np.typing.IntArrayLike | None = None, + data: NumberArrayLike1D, + nvals: IntArrayLike1D | None = None, fit: FittingMethod = "RANSAC", *, debug_plot: bool = False, @@ -1486,8 +1515,8 @@ def hurst_rs( @overload def hurst_rs( - data: np.typing.FloatArrayLike | np.typing.IntArrayLike, - nvals: np.typing.IntArrayLike | None = None, + data: NumberArrayLike1D, + nvals: IntArrayLike1D | None = None, fit: FittingMethod = "RANSAC", *, debug_plot: bool = False, @@ -1498,16 +1527,16 @@ def hurst_rs( ) -> tuple[ float, tuple[ - np.ndarray[tuple[int], np.dtype[np.float64]], - np.ndarray[tuple[int], 
np.dtype[np.float64]], - np.ndarray[tuple[int], np.dtype[np.float64]], + FloatArray1D, + FloatArray1D, + FloatArray1D, ], ]: ... def hurst_rs( - data: np.typing.FloatArrayLike | np.typing.IntArrayLike, - nvals: np.typing.IntArrayLike | None = None, + data: NumberArrayLike1D, + nvals: IntArrayLike1D | None = None, fit: FittingMethod = "RANSAC", *, debug_plot: bool = False, @@ -1520,9 +1549,9 @@ def hurst_rs( | tuple[ float, tuple[ - np.ndarray[tuple[int], np.dtype[np.float64]], - np.ndarray[tuple[int], np.dtype[np.float64]], - np.ndarray[tuple[int], np.dtype[np.float64]], + FloatArray1D, + FloatArray1D, + FloatArray1D, ], ] ): @@ -1713,54 +1742,54 @@ def hurst_rs( @overload def mfhurst_b( - data: np.typing.IntArrayLike | np.typing.FloatArrayLike, - qvals: np.typing.FloatArrayLike | None = None, - dists: np.typing.IntArrayLike | None = None, + data: NumberArrayLike1D, + qvals: FloatArrayLike1D | None = None, + dists: IntArrayLike1D | None = None, fit: FittingMethod = "poly", *, debug_plot: bool = False, debug_data: Literal[False] = False, plot_file: str | Path | None = None, -) -> np.ndarray[tuple[int], np.dtype[np.float64]]: ... +) -> FloatArray1D: ... @overload def mfhurst_b( - data: np.typing.IntArrayLike | np.typing.FloatArrayLike, - qvals: np.typing.FloatArrayLike | None = None, - dists: np.typing.IntArrayLike | None = None, + data: NumberArrayLike1D, + qvals: FloatArrayLike1D | None = None, + dists: IntArrayLike1D | None = None, fit: FittingMethod = "poly", *, debug_plot: bool = False, debug_data: Literal[True] = True, plot_file: str | Path | None = None, ) -> tuple[ - np.ndarray[tuple[int], np.dtype[np.float64]], + FloatArray1D, tuple[ - np.ndarray[tuple[int], np.dtype[np.float64]], - np.ndarray[tuple[int], np.dtype[np.float64]], - np.ndarray[tuple[int], np.dtype[np.float64]], + FloatArray1D, + FloatArray1D, + FloatArray1D, ], ]: ... 
def mfhurst_b( - data: np.typing.IntArrayLike | np.typing.FloatArrayLike, - qvals: np.typing.FloatArrayLike | None = None, - dists: np.typing.IntArrayLike | None = None, + data: NumberArrayLike1D, + qvals: FloatArrayLike1D | None = None, + dists: IntArrayLike1D | None = None, fit: FittingMethod = "poly", *, debug_plot: bool = False, debug_data: bool = False, plot_file: str | Path | None = None, ) -> ( - np.ndarray[tuple[int], np.dtype[np.float64]] + FloatArray1D | tuple[ - np.ndarray[tuple[int], np.dtype[np.float64]], + FloatArray1D, tuple[ - np.ndarray[tuple[int], np.dtype[np.float64]], - np.ndarray[tuple[int], np.dtype[np.float64]], - np.ndarray[tuple[int], np.dtype[np.float64]], + FloatArray1D, + FloatArray1D, + FloatArray1D, ], ] ): @@ -1885,7 +1914,7 @@ def mfhurst_b( stacklevel=2, ) - def hhcorr(d: int, q: float) -> float: + def hhcorr(d: np.integer, q: np.floating) -> np.floating: """Calculates the height-height correlation for a given distance d and q.""" diffs = np.abs(data[:-d] - data[d:]) diffs = diffs[np.where(diffs > 0)] @@ -1900,13 +1929,16 @@ def hhcorr(d: int, q: float) -> float: xvals = np.log(dists) yvals = np.log(corrvals) polys = np.array( - [poly_fit(xvals, yvals[:, qi], 1, fit=fit) for qi in range(len(qvals))], dtype=np.float64 + [poly_fit(xvals, yvals[:, qi], 1, fit=fit) for qi in range(len(qvals))], + dtype=np.float64, ) H = np.array(polys)[:, 0] / qvals if debug_plot: plot_reg_multiple( np.array([xvals] * len(qvals), dtype=np.float64), - np.array([yvals[:, qi] / qvals[qi] for qi in range(len(qvals))], dtype=np.float64), + np.array( + [yvals[:, qi] / qvals[qi] for qi in range(len(qvals))], dtype=np.float64 + ), np.array([p / q for p, q in zip(polys, qvals)], dtype=np.float64), x_label="log(x)", y_label="$\\log(c_q(x)) / q$", @@ -1919,7 +1951,7 @@ def hhcorr(d: int, q: float) -> float: return H -def _genhurst(S: np.ndarray[tuple[int], np.dtype[np.float64]], q: float) -> float: +def _genhurst(S: FloatArray1D, q: float) -> float: """Computes 
the generalized hurst exponent H_q for time series S. This function should not be used. It is only kept here to demonstrate that @@ -2006,9 +2038,9 @@ def _genhurst(S: np.ndarray[tuple[int], np.dtype[np.float64]], q: float) -> floa def _aste_line_fit( - x: np.typing.IntArrayLike | np.typing.FloatArrayLike, - y: np.typing.IntArrayLike | np.typing.FloatArrayLike, -) -> list[float]: + x: NumberArrayLike1D, + y: NumberArrayLike1D, +) -> list[np.floating]: """Simple linear regression with ordinary least squares. See https://en.wikipedia.org/wiki/Simple_linear_regression. @@ -2055,9 +2087,9 @@ def _aste_line_fit( @overload def mfhurst_dm( - data: np.typing.IntArrayLike | np.typing.FloatArrayLike, - qvals: np.typing.FloatArrayLike | None = None, - max_dists: np.typing.IntArrayLike | None = None, + data: NumberArrayLike1D, + qvals: FloatArrayLike1D | None = None, + max_dists: IntArrayLike1D | None = None, *, detrend: bool = True, fit: FittingMethod = "poly", @@ -2065,15 +2097,16 @@ def mfhurst_dm( debug_data: Literal[False] = False, plot_file: str | Path | None = None, ) -> tuple[ - np.ndarray[tuple[int], np.dtype[np.float64]], np.ndarray[tuple[int], np.dtype[np.float64]] + FloatArray1D, + FloatArray1D, ]: ... 
@overload def mfhurst_dm( - data: np.typing.IntArrayLike | np.typing.FloatArrayLike, - qvals: np.typing.FloatArrayLike | None = None, - max_dists: np.typing.IntArrayLike | None = None, + data: NumberArrayLike1D, + qvals: FloatArrayLike1D | None = None, + max_dists: IntArrayLike1D | None = None, *, detrend: bool = True, fit: FittingMethod = "poly", @@ -2081,20 +2114,20 @@ def mfhurst_dm( debug_data: Literal[True] = True, plot_file: str | Path | None = None, ) -> tuple[ - np.ndarray[tuple[int], np.dtype[np.float64]], - np.ndarray[tuple[int], np.dtype[np.float64]], + FloatArray1D, + FloatArray1D, tuple[ - np.ndarray[tuple[int], np.dtype[np.float64]], - np.ndarray[tuple[int, int], np.dtype[np.float64]], - np.ndarray[tuple[int, int], np.dtype[np.float64]], + FloatArray1D, + FloatArray2D, + FloatArray2D, ], ]: ... def mfhurst_dm( - data: np.typing.IntArrayLike | np.typing.FloatArrayLike, - qvals: np.typing.FloatArrayLike | None = None, - max_dists: np.typing.IntArrayLike | None = None, + data: NumberArrayLike1D, + qvals: FloatArrayLike1D | None = None, + max_dists: IntArrayLike1D | None = None, *, detrend: bool = True, fit: FittingMethod = "poly", @@ -2103,15 +2136,16 @@ def mfhurst_dm( plot_file: str | Path | None = None, ) -> ( tuple[ - np.ndarray[tuple[int], np.dtype[np.float64]], np.ndarray[tuple[int], np.dtype[np.float64]] + FloatArray1D, + FloatArray1D, ] | tuple[ - np.ndarray[tuple[int], np.dtype[np.float64]], - np.ndarray[tuple[int], np.dtype[np.float64]], + FloatArray1D, + FloatArray1D, tuple[ - np.ndarray[tuple[int], np.dtype[np.float64]], - np.ndarray[tuple[int, int], np.dtype[np.float64]], - np.ndarray[tuple[int, int], np.dtype[np.float64]], + FloatArray1D, + FloatArray2D, + FloatArray2D, ], ] ): @@ -2229,7 +2263,12 @@ def mfhurst_dm( if detrend: stepdata = detrend_data(stepdata, order=1) diffs = stepdata[1:] - stepdata[:-1] - hhcorr.append([np.mean(np.abs(diffs) ** q) / np.mean(np.abs(stepdata) ** q) for q in qvals]) + hhcorr.append( + [ + 
np.mean(np.abs(diffs) ** q) / np.mean(np.abs(stepdata) ** q) + for q in qvals + ] + ) hhcorr = np.array(hhcorr, dtype=np.float64) xvals = np.log(np.arange(1, max_max_dist + 1)) yvals = np.log(hhcorr) @@ -2251,7 +2290,9 @@ def mfhurst_dm( ) plot_reg_multiple( np.array([xvals] * len(qvals), dtype=np.float64), - np.array([yvals[:, qi] / qvals[qi] for qi in range(len(qvals))], dtype=np.float64), + np.array( + [yvals[:, qi] / qvals[qi] for qi in range(len(qvals))], dtype=np.float64 + ), polys, x_label="log(x)", y_label="$\\log(c_q(x)) / q$", @@ -2268,16 +2309,16 @@ def mfhurst_dm( @overload def corr_dim( - data: np.typing.IntArrayLike | np.typing.FloatArrayLike, + data: NumberArrayLike1D, emb_dim: int = 2, lag: int = 1, - rvals: np.typing.FloatArrayLike | None = None, + rvals: FloatArrayLike1D | None = None, dist: Callable[ [ - np.ndarray[tuple[int, int], np.dtype[np.float64]], - np.ndarray[tuple[int], np.dtype[np.float64]], + FloatArray2D, + FloatArray1D, ], - np.ndarray[tuple[int], np.dtype[np.float64]], + FloatArray1D, ] = rowwise_euclidean, fit: FittingMethod = "RANSAC", *, @@ -2289,16 +2330,16 @@ def corr_dim( @overload def corr_dim( - data: np.typing.IntArrayLike | np.typing.FloatArrayLike, + data: NumberArrayLike1D, emb_dim: int = 2, lag: int = 1, - rvals: np.typing.FloatArrayLike | None = None, + rvals: FloatArrayLike1D | None = None, dist: Callable[ [ - np.ndarray[tuple[int, int], np.dtype[np.float64]], - np.ndarray[tuple[int], np.dtype[np.float64]], + FloatArray2D, + FloatArray1D, ], - np.ndarray[tuple[int], np.dtype[np.float64]], + FloatArray1D, ] = rowwise_euclidean, fit: FittingMethod = "RANSAC", *, @@ -2308,24 +2349,24 @@ def corr_dim( ) -> tuple[ float, tuple[ - np.ndarray[tuple[int], np.dtype[np.float64]], - np.ndarray[tuple[int], np.dtype[np.float64]], - np.ndarray[tuple[int], np.dtype[np.float64]], + FloatArray1D, + FloatArray1D, + FloatArray1D, ], ]: ... 
def corr_dim( - data: np.typing.IntArrayLike | np.typing.FloatArrayLike, + data: NumberArrayLike1D, emb_dim: int = 2, lag: int = 1, - rvals: np.typing.FloatArrayLike | None = None, + rvals: FloatArrayLike1D | None = None, dist: Callable[ [ - np.ndarray[tuple[int, int], np.dtype[np.float64]], - np.ndarray[tuple[int], np.dtype[np.float64]], + FloatArray2D, + FloatArray1D, ], - np.ndarray[tuple[int], np.dtype[np.float64]], + FloatArray1D, ] = rowwise_euclidean, fit: FittingMethod = "RANSAC", *, @@ -2337,9 +2378,9 @@ def corr_dim( | tuple[ float, tuple[ - np.ndarray[tuple[int], np.dtype[np.float64]], - np.ndarray[tuple[int], np.dtype[np.float64]], - np.ndarray[tuple[int], np.dtype[np.float64]], + FloatArray1D, + FloatArray1D, + FloatArray1D, ], ] ): @@ -2424,11 +2465,11 @@ def corr_dim( - csums: the corresponding log(C(r)) - poly: the line coefficients (``[slope, intercept]``) """ - # TODO determine lag in units of time instead of number of datapoints + # TODO: determine lag in units of time instead of number of datapoints data = np.asarray(data) - # TODO what are good values for r? - # TODO do this for multiple values of emb_dim? + # TODO: what are good values for r? + # TODO: do this for multiple values of emb_dim? 
if rvals is None: sd = float(np.std(data, ddof=1)) rvals = logarithmic_r(0.1 * sd, 0.5 * sd, 1.03) @@ -2466,17 +2507,21 @@ def corr_dim( else: poly = poly_fit(np.log(rvals), np.log(csums), 1, fit=fit) if debug_plot: - plot_reg(np.log(rvals), np.log(csums), poly, "log(r)", "log(C(r))", fname=plot_file) + plot_reg( + np.log(rvals), np.log(csums), poly, "log(r)", "log(C(r))", fname=plot_file + ) if debug_data: return (poly[0], (np.log(rvals), np.log(csums), poly)) return poly[0] def detrend_data( - data: np.ndarray[tuple[int], np.dtype[np.float64]], order: int = 1, fit: FittingMethod = "poly" -) -> np.ndarray[tuple[int], np.dtype[np.float64]]: + data: FloatArray1D, + order: int = 1, + fit: FittingMethod = "poly", +) -> FloatArray1D: """Removes a trend of given order from the data.""" - # TODO also use this function in dfa + # TODO: also use this function in dfa xvals = np.arange(len(data)) trend = poly_fit(xvals, data, order, fit=fit) return data - np.polyval(trend, xvals) @@ -2484,8 +2529,8 @@ def detrend_data( @overload def dfa( - data: np.typing.IntArrayLike | np.typing.FloatArrayLike, - nvals: np.typing.IntArrayLike | None = None, + data: NumberArrayLike1D, + nvals: IntArrayLike1D | None = None, *, overlap: bool = True, order: int = 1, @@ -2499,8 +2544,8 @@ def dfa( @overload def dfa( - data: np.typing.IntArrayLike | np.typing.FloatArrayLike, - nvals: np.typing.IntArrayLike | None = None, + data: NumberArrayLike1D, + nvals: IntArrayLike1D | None = None, *, overlap: bool = True, order: int = 1, @@ -2512,16 +2557,16 @@ def dfa( ) -> tuple[ float, tuple[ - np.ndarray[tuple[int], np.dtype[np.float64]], - np.ndarray[tuple[int], np.dtype[np.float64]], - np.ndarray[tuple[int], np.dtype[np.float64]], + FloatArray1D, + FloatArray1D, + FloatArray1D, ], ]: ... 
def dfa( # noqa: C901, PLR0912, PLR0915 - data: np.typing.IntArrayLike | np.typing.FloatArrayLike, - nvals: np.typing.IntArrayLike | None = None, + data: NumberArrayLike1D, + nvals: IntArrayLike1D | None = None, *, overlap: bool = True, order: int = 1, @@ -2535,9 +2580,9 @@ def dfa( # noqa: C901, PLR0912, PLR0915 | tuple[ float, tuple[ - np.ndarray[tuple[int], np.dtype[np.float64]], - np.ndarray[tuple[int], np.dtype[np.float64]], - np.ndarray[tuple[int], np.dtype[np.float64]], + FloatArray1D, + FloatArray1D, + FloatArray1D, ], ] ): @@ -2754,7 +2799,14 @@ def dfa( # noqa: C901, PLR0912, PLR0915 else: poly = poly_fit(np.log(nvals), np.log(fluctuations), 1, fit=fit_exp) if debug_plot: - plot_reg(np.log(nvals), np.log(fluctuations), poly, "log(n)", "std(X,n)", fname=plot_file) + plot_reg( + np.log(nvals), + np.log(fluctuations), + poly, + "log(n)", + "std(X,n)", + fname=plot_file, + ) if debug_data: return (poly[0], (np.log(nvals), np.log(fluctuations), poly)) return poly[0] From 096335a5a30f80a25652b0eeaae25ce025d17af5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christopher=20Sch=C3=B6lzel?= Date: Sat, 9 Aug 2025 19:35:25 +0200 Subject: [PATCH 20/36] fixes a few extra linting errors --- nolds/datasets.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/nolds/datasets.py b/nolds/datasets.py index c903066..645ace6 100644 --- a/nolds/datasets.py +++ b/nolds/datasets.py @@ -88,7 +88,7 @@ def lorenz_lyap(sigma: float, rho: float, beta: float) -> float: def fbm( n: int, - H: float = 0.75, # noqa: N803 + H: float = 0.75, random_seed: int | None = None, ) -> np.ndarray[tuple[int], np.dtype[np.float64]]: """Generates fractional brownian motions of desired length. 
@@ -115,13 +115,13 @@ def R( # noqa: N802 t: np.ndarray[tuple[int], np.dtype[np.float64]], s: np.ndarray[tuple[int], np.dtype[np.float64]], ) -> np.ndarray[tuple[int], np.dtype[np.float64]]: - twoH = 2 * H # noqa: N806 + twoH = 2 * H return 0.5 * (s**twoH + t**twoH - np.abs(t - s) ** twoH) # form the matrix tau gamma = R(*np.mgrid[0:n, 0:n]) # apply R to every element in matrix - w, P = np.linalg.eigh(gamma) # noqa: N806 - L = np.diag(w) # noqa: N806 + w, P = np.linalg.eigh(gamma) + L = np.diag(w) sigma = np.dot(np.dot(P, np.sqrt(L)), np.linalg.inv(P)) gen = np.random.default_rng(seed=random_seed) v = gen.standard_normal(n) @@ -130,7 +130,7 @@ def R( # noqa: N802 def fgn( n: int, - H: float = 0.75, # noqa: N803 + H: float = 0.75, random_seed: int | None = None, ) -> np.ndarray[tuple[int], np.dtype[np.float64]]: """Generates fractional gaussian noise of desired length. @@ -226,7 +226,7 @@ def load_lorenz_physionet() -> tuple[ return data_in, data_out -def tent_map(x: float, steps: int, mu: int = 2) -> Generator[float, None, None]: +def tent_map(x: float, steps: int, mu: float = 2) -> Generator[float, None, None]: """Generates a time series of the tent map. Characteristics and Background: From 284b549882590b274eda6bd336823b4b698dde73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christopher=20Sch=C3=B6lzel?= Date: Sat, 9 Aug 2025 20:32:27 +0200 Subject: [PATCH 21/36] modernizes examples --- nolds/examples.py | 330 ++++++++++++++++++++++++---------------------- 1 file changed, 172 insertions(+), 158 deletions(-) diff --git a/nolds/examples.py b/nolds/examples.py index 91e443d..b66b2d9 100644 --- a/nolds/examples.py +++ b/nolds/examples.py @@ -1,12 +1,24 @@ +"""Example use cases for measures implemented in nolds. + +The functions in this module aim to recreate experiments published in literature. +""" + +from __future__ import annotations + +import argparse +from typing import TYPE_CHECKING, Literal + import numpy as np from . import datasets from . 
import measures as nolds +if TYPE_CHECKING: + from nolds.measures import FloatArray1D, FloatArray2D, IntArray1D, NumberArrayLike1D -def weron_2002_figure2(n=10000) -> None: - """Recreates figure 2 of [w]_ comparing the reported values by Weron to the - values obtained by the functions in this package. + +def weron_2002_figure2(n: int = 10000) -> None: + """Recreates figure 2 of Weron 2002 ([w]_). The experiment consists of n iterations where the hurst exponent of randomly generated gaussian noise is calculated. This is done with differing sequence @@ -33,10 +45,9 @@ def weron_2002_figure2(n=10000) -> None: properties and confidence intervals,” Physica A: Statistical Mechanics and its Applications, vol. 312, no. 1, pp. 285–299, 2002. - Kwargs: - n (int): - number of iterations of the experiment (Weron used 10000, but this takes - a while) + Args: + n: number of iterations of the experiment (Weron used 10000, but this takes + a while) """ # local import to avoid dependency for non-debug use import matplotlib.pyplot as plt @@ -46,23 +57,31 @@ def weron_2002_figure2(n=10000) -> None: reported = [6.708, 13.103, 20.240, 21.924, 22.256, 24.112, 24.054, 26.299, 26.897] reported_raw = [160.599, 141.663, 128.454, 115.617, 103.651, 95.481, 86.810, 81.799, 76.270] - def height_to_h(height): + def height_to_h( + height: FloatArray1D, + ) -> FloatArray1D: + """Returns Hurst exponent for specific height coordinates in the inkscape file. + + Args: + height: The height coordinates from the inkscape file. 
+ """ return 0.49 + height / 29.894 * 0.01 reported = height_to_h(np.array(reported)) reported_raw = height_to_h(np.array(reported_raw)) + rng = np.random.default_rng(3897509205) data = [] for e in range(8, 17): - l = 2**e + length = 2**e nvals = 2 ** np.arange(6, e) - rsn = np.mean([nolds.hurst_rs(np.random.normal(size=l), fit="poly") for _ in range(n)]) + rsn = np.mean([nolds.hurst_rs(rng.normal(size=length), fit="poly") for _ in range(n)]) rs50 = np.mean( - [nolds.hurst_rs(np.random.normal(size=l), fit="poly", nvals=nvals) for _ in range(n)] + [nolds.hurst_rs(rng.normal(size=length), fit="poly", nvals=nvals) for _ in range(n)] ) rs50_raw = np.mean( [ nolds.hurst_rs( - np.random.normal(size=l), + rng.normal(size=length), fit="poly", nvals=nvals, corrected=False, @@ -75,33 +94,34 @@ def height_to_h(height): r = plt.plot(np.arange(8, 17), reported) rr = plt.plot(np.arange(8, 17), reported_raw) plt.legend(r + rr + lines, ("weron", "weron_raw", "rsn", "rs50", "rs50_raw")) - plt.xticks(np.arange(8, 17), 2 ** np.arange(8, 17)) + plt.xticks(np.arange(8, 17), [str(x) for x in 2 ** np.arange(8, 17)]) plt.xlabel("sequence length") plt.ylabel("estimated hurst exponent") plt.show() def plot_hurst_hist() -> None: - """Plots a histogram of values obtained for the hurst exponent of uniformly - distributed white noise. + """Plots a histogram of values obtained for the hurst exponent of white noise. This function requires the package ``matplotlib``. 
""" # local import to avoid dependency for non-debug use import matplotlib.pyplot as plt - hs = [nolds.hurst_rs(np.random.random(size=10000), corrected=True) for _ in range(100)] + rng = np.random.default_rng(869879538) + hs = [nolds.hurst_rs(rng.random(size=10000), corrected=True) for _ in range(100)] plt.hist(hs, bins=20) plt.xlabel("esimated value of hurst exponent") plt.ylabel("number of experiments") plt.show() -def plot_lyap(maptype="logistic") -> None: - """Plots a bifurcation plot of the given map and superimposes the true - lyapunov exponent as well as the estimates of the largest lyapunov exponent - obtained by ``lyap_r`` and ``lyap_e``. The idea for this plot is taken - from [ll]_. +def plot_lyap(maptype: Literal["logistic", "tent"] = "logistic") -> None: + """Creates a bifurcation plot of the given map and its lyapunov exponents. + + This superimposes the true lyapunov exponent as well as the estimates of the + largest lyapunov exponent obtained by ``lyap_r`` and ``lyap_e``. + The idea for this plot is taken from [ll]_. This function requires the package ``matplotlib``. @@ -110,10 +130,8 @@ def plot_lyap(maptype="logistic") -> None: .. [ll] Manfred Füllsack, "Lyapunov exponent", url: http://systems-sciences.uni-graz.at/etextbook/sw2/lyapunov.html - Kwargs: - maptype (str): - can be either ``"logistic"`` for the logistic map or ``"tent"`` for the - tent map. + Args: + maptype: can be either ``"logistic"`` for the logistic map or ``"tent"`` for the tent map. 
""" # local import to avoid dependency for non-debug use import matplotlib.pyplot as plt @@ -126,7 +144,7 @@ def plot_lyap(maptype="logistic") -> None: param_range = np.arange(2, 4, 0.01) full_data = np.array( [ - np.fromiter(datasets.logistic_map(x_start, n, r), dtype="float32") + np.fromiter(datasets.logistic_map(x_start, n, float(r)), dtype="float32") for r in param_range ] ) @@ -138,15 +156,19 @@ def plot_lyap(maptype="logistic") -> None: # https://blog.abhranil.net/2015/05/15/lyapunov-exponent-of-the-logistic-map-mathematica-code/ # Derivative of logistic map: f(x) = r * x * (1 - x) = r * x - r * x² # => f'(x) = r - 2 * r * x + x_0 = 0.5 # avoid zero crossings in f'(x) lambdas = [ - np.mean(np.log(abs(r - 2 * r * x[np.where(x != 0.5)]))) + np.mean(np.log(abs(r - 2 * r * x[np.where(x != x_0)]))) for x, r in zip(full_data, param_range) ] elif maptype == "tent": param_name = "$\\mu$" param_range = np.arange(0, 2, 0.01) - full_data = np.array( - [np.fromiter(datasets.tent_map(x_start, n, mu), dtype="float32") for mu in param_range] + full_data: FloatArray2D = np.array( + [ + np.fromiter(datasets.tent_map(x_start, n, float(mu)), dtype="float32") + for mu in param_range + ] ) # for the tent map the lyapunov exponent is much easier to calculate # since the values are multiplied by mu in each step, two trajectories @@ -156,12 +178,12 @@ def plot_lyap(maptype="logistic") -> None: lambdas[np.where(param_range <= 0)] = np.nan else: msg = f"maptype {maptype} not recognized" - raise Error(msg) + raise ValueError(msg) kwargs_e = {"emb_dim": 6, "matrix_dim": 2} kwargs_r = {"emb_dim": 6, "lag": 2, "min_tsep": 20, "trajectory_len": 20} - lambdas_e = [max(nolds.lyap_e(d, **kwargs_e)) for d in full_data] - lambdas_r = [nolds.lyap_r(d, **kwargs_r) for d in full_data] + lambdas_e = [np.max(nolds.lyap_e(d, **kwargs_e)) for d in full_data] # pyright: ignore reportCallIssue + lambdas_r = [nolds.lyap_r(d, **kwargs_r) for d in full_data] # pyright: ignore reportCallIssue bifur_x = 
np.repeat(param_range, nbifur) bifur = np.reshape(full_data[:, -nbifur:], nbifur * param_range.shape[0]) @@ -181,29 +203,24 @@ def plot_lyap(maptype="logistic") -> None: def profiling() -> None: - """Runs a profiling test for the function ``lyap_e`` (mainly used for - development). + """Runs a profiling test for the function ``lyap_e``. - This function requires the package ``cProfile``. + This function is mainly used for development and requires the package ``cProfile``. """ import cProfile n = 10000 - data = np.cumsum(np.random.random(n) - 0.5) + rng = np.random.default_rng(2030628104) + data = np.cumsum(rng.random(n) - 0.5) cProfile.runctx("lyap_e(data)", {"lyap_e": nolds.lyap_e}, {"data": data}) -def hurst_compare_nvals(data, nvals=None) -> None: - """Creates a plot that compares the results of different choices for nvals - for the function hurst_rs. +def hurst_compare_nvals(data: NumberArrayLike1D, nvals: IntArray1D | None = None) -> None: + """Creates a plot that compares the results of different nvals for the function hurst_rs. 
Args: - data (array-like of float): - the input data from which the hurst exponent should be estimated - - Kwargs: - nvals (array of int): - a manually selected value for the nvals parameter that should be plotted + data: the input data from which the hurst exponent should be estimated + nvals: a manually selected value for the nvals parameter that should be plotted in comparison to the default choices """ import matplotlib.pyplot as plt @@ -216,7 +233,8 @@ def hurst_compare_nvals(data, nvals=None) -> None: n_div = n_all[np.where(len(data) % n_all[:-1] == 0)] dd_div = nolds.hurst_rs(data, nvals=n_div, debug_data=True, fit="poly") - def corr(nvals): + def corr(nvals: IntArray1D) -> list[np.float64]: + """Calculacte correction offset using expected_rs.""" return [np.log(nolds.expected_rs(n)) for n in nvals] l_all = plt.plot(dd_all[1][0], dd_all[1][1] - corr(n_all), "o") @@ -235,31 +253,16 @@ def corr(nvals): l_all + l_def + l_div + l_cst, ["all", "default", "divisors", *t_cst], ) - labeled_data = zip([dd_all[0], dd_def[0], dd_div[0]], ["all", "def", "div"]) - for data, _label in labeled_data: - pass - if nvals is not None: - pass plt.show() -def sampen_default_tolerance() -> None: - data = list(datasets.logistic_map(0.34, 1000, r=3.9)) - oldtol = 0.2 * np.std(data, ddof=1) - old_res = [nolds.sampen(data, emb_dim=i, tolerance=oldtol) for i in range(1, 30)] - new_res = [nolds.sampen(data, emb_dim=i) for i in range(1, 30)] - for _i, _old, _new in zip(range(1, 30), old_res, new_res): - pass - - -def aste_line_fitting(N=100) -> None: - """Shows plot that proves that the line fitting in T. Astes original MATLAB code - provides the same results as `np.polyfit`. - """ - slope = np.random.random() * 10 - 5 - intercept = np.random.random() * 100 - 50 +def aste_line_fitting(N: int = 100) -> None: + """Proves equivalence of T. 
Aste's original MATLAB code and `np.polyfit`.""" + rng = np.random.default_rng(528714688) + slope = rng.random() * 10 - 5 + intercept = rng.random() * 100 - 50 xvals = np.arange(N) - yvals = xvals * slope + intercept + np.random.randn(N) * 100 + yvals = xvals * slope + intercept + rng.standard_normal(N) * 100 import matplotlib.pyplot as plt plt.plot(xvals, yvals, "rx", label="data") @@ -270,7 +273,7 @@ def aste_line_fitting(N=100) -> None: label=f"true ({slope:.3f} x + {intercept:.3f})", alpha=0.5, ) - i_aste, s_aste = nolds._aste_line_fit(xvals, yvals) + i_aste, s_aste = nolds._aste_line_fit(xvals, yvals) # noqa: SLF001 s_np, i_np = np.polyfit(xvals, yvals, 1) plt.plot( [0, N - 1], @@ -290,9 +293,8 @@ def aste_line_fitting(N=100) -> None: plt.show() -def hurst_mf_stock(debug=False) -> None: - """Recreates results from [mfs_1]_ (table at start of section 4) as print - output. +def hurst_mf_stock(*, debug: bool = False) -> None: + """Recreates results from [mfs_1]_ (table at start of section 4) as print output. Unfortunately as a layman in finance, I could not determine the exact data that Di Matteo et al. used. Instead I use the data from @@ -317,13 +319,12 @@ def hurst_mf_stock(debug=False) -> None: References: - .. [mfs_1] T. Di Matteo, T. Aste, and M. M. Dacorogna, “Scaling behaviors - in differently developed markets,” Physica A: Statistical Mechanics - and its Applications, vol. 324, no. 1–2, pp. 183–188, 2003. + .. [mfs_1] T. Di Matteo, T. Aste, and M. M. Dacorogna, “Scaling behaviors + in differently developed markets,” Physica A: Statistical Mechanics + and its Applications, vol. 324, no. 1–2, pp. 183–188, 2003. - Kwargs: - debug (boolean): - if `True`, a debug plot will be shown for each calculated GHE value + Args: + debug: if `True`, a debug plot will be shown for each calculated GHE value except for the ones generated by `_genhurst`. 
""" financial = [ @@ -332,18 +333,18 @@ def hurst_mf_stock(debug=False) -> None: (datasets.ndx, "ndx"), ] for data, _lab in financial: - data = data[1][:, 0] - data = np.log(data) + timeseries = data[1][:, 0] + timeseries = np.log(timeseries) dists = range(1, 20) - nolds.mfhurst_b(data, qvals=[2], dists=dists, debug_plot=debug)[0] + nolds.mfhurst_b(timeseries, qvals=[2], dists=dists, debug_plot=debug)[0] nolds.mfhurst_b( - nolds.detrend_data(data, order=1), + nolds.detrend_data(timeseries, order=1), qvals=[2], dists=dists, debug_plot=debug, )[0] - nolds.mfhurst_dm(data, qvals=[2], debug_plot=debug)[0][0] - nolds._genhurst(data, 2) + nolds.mfhurst_dm(timeseries, qvals=[2], debug_plot=debug)[0][0] + nolds._genhurst(timeseries, 2) # noqa: SLF001 def barabasi_1991_figure2() -> None: @@ -354,8 +355,8 @@ def barabasi_1991_figure2() -> None: with b1 = 0.8 and b2 = 0.5. References: - .. [bf2] A.-L. Barabási and T. Vicsek, “Multifractality of self-affine - fractals,” Physical Review A, vol. 44, no. 4, pp. 2730–2733, 1991. + .. [bf2] A.-L. Barabási and T. Vicsek, “Multifractality of self-affine + fractals,” Physical Review A, vol. 44, no. 4, pp. 2730–2733, 1991. """ import matplotlib.pyplot as plt @@ -365,7 +366,6 @@ def barabasi_1991_figure2() -> None: b1 = 0.8 b2 = 0.5 dists = [4**i for i in range(6, 11)] - # dists = nolds.logarithmic_n(100, 0.01 * len(b1991), 2) Hq = nolds.mfhurst_b(b1991, qvals=qvals, dists=dists) Hq_t = [np.log((b1**q + b2**q) / 2) / np.log(0.25) / q for q in qvals_t] plt.plot(qvals, Hq, "r+", label="mfhurst_b") @@ -383,16 +383,16 @@ def barabasi_1991_figure3() -> None: Brownian motion that moves in unit steps (-1 or +1) in each time step. References: - .. [bf3] A.-L. Barabási and T. Vicsek, “Multifractality of self-affine - fractals,” Physical Review A, vol. 44, no. 4, pp. 2730–2733, 1991. + .. [bf3] A.-L. Barabási and T. Vicsek, “Multifractality of self-affine + fractals,” Physical Review A, vol. 44, no. 4, pp. 2730–2733, 1991. 
""" import matplotlib.pyplot as plt - brown = np.cumsum(np.random.randint(0, 2, size=10000000) * 2 - 1) + rng = np.random.default_rng(2562651293) + brown = np.cumsum(rng.integers(0, 2, size=10000000) * 2 - 1) qvals = [-5, -4, -3, -2, -1.1, 0.1, 1, 2, 3, 4, 5] Hq_t = [0.5 if q > -1 else -0.5 / q for q in qvals] dists = [2**i for i in range(6, 15)] - # dists = nolds.logarithmic_n(100, 0.01 * len(brown), 1.5) Hq = nolds.mfhurst_b(brown, qvals=qvals, dists=dists, debug_plot=False) plt.plot(qvals, Hq, "r+", label="mfhurst_b") plt.plot(qvals, Hq_t, label="calculated value") @@ -404,7 +404,9 @@ def barabasi_1991_figure3() -> None: def lorenz() -> None: - """Calculates different measures for the Lorenz system of ordinary + """Compares nolds results with prescribed results for Lorenz system. + + Calculates different measures for the Lorenz system of ordinary differential equations and compares nolds results with prescribed results from the literature. @@ -438,46 +440,39 @@ def lorenz() -> None: References: - .. [l_1] P. Grassberger and I. Procaccia, “Measuring the strangeness - of strange attractors,” Physica D: Nonlinear Phenomena, vol. 9, - no. 1, pp. 189–208, 1983. - .. [l_2] F. Kaffashi, R. Foglyano, C. G. Wilson, and K. A. Loparo, - “The effect of time delay on Approximate & Sample Entropy - calculations,” Physica D: Nonlinear Phenomena, vol. 237, no. 23, - pp. 3069–3074, 2008, doi: 10.1016/j.physd.2008.06.005. - .. [l_3] V. Suyal, A. Prasad, and H. P. Singh, “Nonlinear Time Series - Analysis of Sunspot Data,” Sol Phys, vol. 260, no. 2, pp. 441–449, - 2009, doi: 10.1007/s11207-009-9467-x. - .. [l_4] G. A. Leonov and N. V. Kuznetsov, “On differences and - similarities in the analysis of Lorenz, Chen, and Lu systems,” - Applied Mathematics and Computation, vol. 256, pp. 334–343, 2015, - doi: 10.1016/j.amc.2014.12.132. - .. [l_5] S. Wallot, J. P. Irmer, M. Tschense, N. Kuznetsov, A. Højlund, - and M. 
Dietz, “A Multivariate Method for Dynamic System Analysis: - Multivariate Detrended Fluctuation Analysis Using Generalized Variance,” - Topics in Cognitive Science, p. tops.12688, Sep. 2023, - doi: 10.1111/tops.12688. - - + .. [l_1] P. Grassberger and I. Procaccia, “Measuring the strangeness + of strange attractors,” Physica D: Nonlinear Phenomena, vol. 9, + no. 1, pp. 189–208, 1983. + .. [l_2] F. Kaffashi, R. Foglyano, C. G. Wilson, and K. A. Loparo, + “The effect of time delay on Approximate & Sample Entropy + calculations,” Physica D: Nonlinear Phenomena, vol. 237, no. 23, + pp. 3069–3074, 2008, doi: 10.1016/j.physd.2008.06.005. + .. [l_3] V. Suyal, A. Prasad, and H. P. Singh, “Nonlinear Time Series + Analysis of Sunspot Data,” Sol Phys, vol. 260, no. 2, pp. 441–449, + 2009, doi: 10.1007/s11207-009-9467-x. + .. [l_4] G. A. Leonov and N. V. Kuznetsov, “On differences and + similarities in the analysis of Lorenz, Chen, and Lu systems,” + Applied Mathematics and Computation, vol. 256, pp. 334–343, 2015, + doi: 10.1016/j.amc.2014.12.132. + .. [l_5] S. Wallot, J. P. Irmer, M. Tschense, N. Kuznetsov, A. Højlund, + and M. Dietz, “A Multivariate Method for Dynamic System Analysis: + Multivariate Detrended Fluctuation Analysis Using Generalized Variance,” + Topics in Cognitive Science, p. tops.12688, Sep. 2023, + doi: 10.1111/tops.12688. 
""" sigma = 10 rho = 28 beta = 8.0 / 3 - start = [0, 22, 10] + start = [0.0, 22.0, 10.0] n = 10000 skip = 10000 dt = 0.012 data = datasets.lorenz_euler(n + skip, sigma, rho, beta, start=start, dt=dt)[skip:] - # fig = plt.figure() - # ax = fig.add_subplot(111, projection="3d") - # ax.plot(data[:, 0], data[:, 1], data[:, 2]) - # plt.show() - # plt.close(fig) - datasets.lorenz_lyap(sigma, rho, beta) # Rationale for argument values: - # start with medium settings for min_tsep and lag, span a large area with trajectory_len, set fit_offset to 0 + # start with medium settings for min_tsep and lag, span a large area with trajectory_len, + # set fit_offset to 0 # up the embedding dimension until you get a clear line in the debug plot # adjust trajectory_len and fit_offset to split off only the linear part # in general: the longer the linear part of the plot, the better @@ -534,7 +529,7 @@ def lorenz() -> None: data_dfa = datasets.lorenz_euler(120000, 10, 28, 8 / 3.0, start=[0.1, 0.1, 0.1], dt=0.002)[ 20000: ] - nvals = nolds.logarithmic_n(200, len(data_dfa) / 8, 2**0.2) + nvals = nolds.logarithmic_n(200, np.ceil(len(data_dfa) / 8), 2**0.2) dfa_args = {"nvals": nvals, "order": 2, "overlap": False, "fit_exp": "poly"} nolds.dfa(data_dfa[:, 0], **dfa_args) nolds.dfa(data_dfa[:, 1], **dfa_args) @@ -543,45 +538,64 @@ def lorenz() -> None: # reference: Kaffashi 2008 # Rationale for argument values: Just follow paper. 
sampen_args = {"emb_dim": 2, "lag": 1} - nolds.sampen(data[:, 0], **sampen_args) - nolds.sampen(data[:, 1], **sampen_args) - nolds.sampen(data[:, 2], **sampen_args) + nolds.sampen(data[:, 0], **sampen_args) # pyright: ignore reportCallIssue + nolds.sampen(data[:, 1], **sampen_args) # pyright: ignore reportCallIssue + nolds.sampen(data[:, 2], **sampen_args) # pyright: ignore reportCallIssue if __name__ == "__main__": # run this with the following command: # python -m nolds.examples lyapunov-logistic - import sys - - def print_options() -> None: - pass - - if len(sys.argv) < 2: - print_options() - elif sys.argv[1] == "lyapunov-logistic": - plot_lyap() - elif sys.argv[1] == "lyapunov-tent": - plot_lyap("tent") - elif sys.argv[1] == "profiling": - profiling() - elif sys.argv[1] == "hurst-weron2": - n = 1000 if len(sys.argv) < 3 else int(sys.argv[2]) - weron_2002_figure2(n) - elif sys.argv[1] == "hurst-hist": - plot_hurst_hist() - elif sys.argv[1] == "hurst-nvals": - hurst_compare_nvals(datasets.brown72) - elif sys.argv[1] == "sampen-tol": - sampen_default_tolerance() - elif sys.argv[1] == "aste-line": - aste_line_fitting() - elif sys.argv[1] == "hurst-mf-stock": - hurst_mf_stock() - elif sys.argv[1] == "hurst-mf-barabasi2": - barabasi_1991_figure2() - elif sys.argv[1] == "hurst-mf-barabasi3": - barabasi_1991_figure3() - elif sys.argv[1] == "lorenz": - lorenz() - else: - print_options() + + parser = argparse.ArgumentParser( + prog="nolds.examples", description="Run examples for nolds metrics." 
+ ) + parser.add_argument( + "example", + choices=[ + "lyapunov-logistic", + "lyapunov-tent", + "profiling", + "hurst-weron2", + "hurst-hist", + "hurst-nvals", + "aste-line", + "hurst-mf-stock", + "hurst-mf-barabasi2", + "hurst-mf-barabasi3", + "lorenz", + ], + help="Which example to run.", + ) + parser.add_argument( + "--size", + "-n", + default=None, + type=int, + help="number of iterations or datapoints to use for certain examples", + ) + args = parser.parse_args() + + match args.example: + case "lyapunov-logistic": + plot_lyap() + case "lyapunov-tent": + plot_lyap("tent") + case "profiling": + profiling() + case "hurst-weron2": + weron_2002_figure2(1000 if args.size is None else args.size) + case "hurst-hist": + plot_hurst_hist() + case "hurst-nvals": + hurst_compare_nvals(datasets.brown72) + case "aste-line": + aste_line_fitting(100 if args.size is None else args.size) + case "hurst-mf-stock": + hurst_mf_stock() + case "hurst-mf-barabasi2": + barabasi_1991_figure2() + case "hurst-mf-barabasi3": + barabasi_1991_figure3() + case "lorenz": + lorenz() From 3439416d194fb04498cf0956cf72c1de68996695 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christopher=20Sch=C3=B6lzel?= Date: Sun, 24 Aug 2025 21:55:06 +0200 Subject: [PATCH 22/36] modernizes tests (WIP) --- nolds/test_measures.py | 367 ++++++++++++++++++++++++----------------- 1 file changed, 215 insertions(+), 152 deletions(-) diff --git a/nolds/test_measures.py b/nolds/test_measures.py index b022a9b..618e7c2 100644 --- a/nolds/test_measures.py +++ b/nolds/test_measures.py @@ -1,12 +1,17 @@ +"""Unit tests for main measures of measures.""" + +from __future__ import annotations + import unittest import warnings -import numpy as np +# from numpy.testing import assert_equal as assert_array_equal +from typing import Any -from nolds import datasets +import numpy as np +from numpy.typing import ArrayLike, DTypeLike -# import internal module to test helping functions -from nolds import measures as nolds +from nolds import 
datasets, measures # TODO add tests for mfhurst_b and mfhurst_dm @@ -25,14 +30,34 @@ class TestNoldsHelperFunctions(unittest.TestCase): """Tests for internal helper functions that are not part of the public API.""" - def assert_array_equals(self, expected, actual, print_arrays=False) -> None: - if print_arrays: - pass - assert np.all(actual == expected) + def assert_array_equal( + self, + expected: ArrayLike, + actual: ArrayLike, + dtype: DTypeLike = np.float64, + ) -> None: + """Test that two arrays are exactly equal. + + Args: + expected: The expected result + actual: The actual result + dtype: dtype of the arrays to compare + """ + expected = np.asarray(expected, dtype=dtype) + actual = np.asarray(actual, dtype=dtype) + diff_indices = np.array((actual != expected).nonzero()).transpose() + first_diff = diff_indices[0] if diff_indices.shape[0] > 0 else None + first_diff_msg = f"{expected[first_diff]} != {actual[first_diff]} at {first_diff}" + msg = ( + f"Arrays differ!\n\nExpected:\n{expected}\n\nActual:\n{actual}" + f"\n\nFirst difference:\n{first_diff_msg}" + ) + assert np.all(actual == expected), msg def test_delay_embed_lag2(self) -> None: - data = np.arange(10, dtype="float32") - embedded = nolds.delay_embedding(data, 4, lag=2) + """Hypothesis: Setting a lag of 2 skips every second element in orbit vectors.""" + data = np.arange(10, dtype=np.float64) + embedded = measures.delay_embedding(data, 4, lag=2) expected = np.array( [ [0, 2, 4, 6], @@ -40,47 +65,50 @@ def test_delay_embed_lag2(self) -> None: [2, 4, 6, 8], [3, 5, 7, 9], ], - dtype="float32", + dtype=np.float64, ) - self.assert_array_equals(expected, embedded) + self.assert_array_equal(expected, embedded) def test_delay_embed(self) -> None: - data = np.arange(6, dtype="float32") - embedded = nolds.delay_embedding(data, 4) + """Hypothesis: Default settings produce consecutive slices of same length.""" + data = np.arange(6, dtype=np.float64) + embedded = measures.delay_embedding(data, 4) expected = 
np.array( [ [0, 1, 2, 3], [1, 2, 3, 4], [2, 3, 4, 5], ], - dtype="float32", + dtype=np.float64, ) - self.assert_array_equals(expected, embedded) + self.assert_array_equal(expected, embedded) def test_delay_embed_lag3(self) -> None: - data = np.arange(10, dtype="float32") - embedded = nolds.delay_embedding(data, 4, lag=3) + """Hypothesis: Setting a lag of 3 only takes every third element.""" + data = np.arange(10, dtype=np.float64) + embedded = measures.delay_embedding(data, 4, lag=3) expected = np.array( [ [0, 3, 6, 9], ], - dtype="float32", + dtype=np.float64, ) - self.assert_array_equals(expected, embedded) + self.assert_array_equal(expected, embedded) def test_delay_embed_empty(self) -> None: - data = np.arange(10, dtype="float32") + """Hpoythesis: An error is raised when settings would lead to an empty orbit vector list.""" + data = np.arange(10, dtype=np.float64) try: - embedded = nolds.delay_embedding(data, 11) + embedded = measures.delay_embedding(data, 11) msg = ( "embedding array of size 10 with embedding dimension 11 should fail, got {} instead" ) self.fail(msg.format(embedded)) except ValueError: pass - data = np.arange(10, dtype="float32") + data = np.arange(10, dtype=np.float64) try: - embedded = nolds.delay_embedding(data, 4, lag=4) + embedded = measures.delay_embedding(data, 4, lag=4) msg = ( "embedding array of size 10 with embedding dimension 4 and " "lag 4 should fail, got {} instead" @@ -94,19 +122,23 @@ class TestNoldsUtility(unittest.TestCase): """Tests for small utility functions that are part of the public API.""" def test_binary_n(self) -> None: - x = nolds.binary_n(1000, min_n=50) + """Hypothesis: binary_n produces exponentially declining numbers.""" + x = measures.binary_n(1000, min_n=50) self.assertSequenceEqual(x, [500, 250, 125, 62]) def test_binary_n_empty(self) -> None: - x = nolds.binary_n(50, min_n=50) + """Hypothesis: binary_n gives empty output if min_n is set too high.""" + x = measures.binary_n(50, min_n=50) 
self.assertSequenceEqual(x, []) def test_logarithmic_n(self) -> None: - x = nolds.logarithmic_n(4, 11, 1.51) + """Hypothesis: logarithmic_n outputs integers that follow an exponential series.""" + x = measures.logarithmic_n(4, 11, 1.51) self.assertSequenceEqual(x, [4, 6, 9]) def test_logarithmic_r(self) -> None: - x = nolds.logarithmic_r(4, 10, 1.51) + """Hypothesis: logarithmic_r outputs floats that follow an exponential series.""" + x = measures.logarithmic_r(4, 10, 1.51) self.assertSequenceEqual(x, [4, 6.04, 9.1204]) @@ -114,11 +146,13 @@ class TestNoldsLyap(unittest.TestCase): """Tests for lyap_e and lyap_r.""" def test_lyap_logistic(self) -> None: + """Hypothesis: The output of lyap_e and lyap_r on a logistic map has the correct sign.""" rvals = [2.5, 3.4, 3.7, 4.0] sign = [-1, -1, 1, 1] x0 = 0.1 - def logistic(x, r): + def logistic(x: float, r: float) -> float: + """Logistic map.""" return r * x * (1 - x) for r, s in zip(rvals, sign): @@ -127,14 +161,16 @@ def logistic(x, r): for _ in range(100): x = logistic(x, r) log.append(x) - log = np.array(log, dtype="float32") - le = np.max(nolds.lyap_e(log, emb_dim=6, matrix_dim=2)) - lr = nolds.lyap_r(log, emb_dim=6, lag=2, min_tsep=10, trajectory_len=20) - assert s == int(np.sign(le)), f"r = {r}" - assert s == int(np.sign(lr)), f"r = {r}" + log = np.array(log, dtype=np.float64) + with self.subTest(meashure="lyap_e", r=r): + le = np.max(measures.lyap_e(log, emb_dim=6, matrix_dim=2)) + self.assertEqual(s, np.sign(le)) + with self.subTest(meashure="lyap_r", r=r): + lr = measures.lyap_r(log, emb_dim=6, lag=2, min_tsep=10, trajectory_len=20) + self.assertEqual(s, np.sign(lr)) def test_lyap_lorenz(self) -> None: - """Test hypothesis: Both lyap_r and lyap_e can reconstruct the largest Lyapunov exponent of the Lorenz system. + """Hypothesis: lyap_r and lyap_e match expected values for the Lorenz system. 
The parameters for generating the Lorenz system were chosen to be as close as possible to the experiments performed by Leonov and Kuznetsov (see [l_4]_) @@ -158,66 +194,87 @@ def test_lyap_lorenz(self) -> None: "fit_offset": 8, "fit": "poly", } - lyap_rx = nolds.lyap_r(data[:, 0], **lyap_r_args) - lyap_ry = nolds.lyap_r(data[:, 1], **lyap_r_args) - lyap_rz = nolds.lyap_r(data[:, 2], **lyap_r_args) lyap_e_args = {"min_tsep": 10, "emb_dim": 5, "matrix_dim": 5, "tau": 0.01, "min_nb": 8} - lyap_ex = nolds.lyap_e(data[:, 0], **lyap_e_args) - lyap_ey = nolds.lyap_e(data[:, 1], **lyap_e_args) - lyap_ez = nolds.lyap_e(data[:, 2], **lyap_e_args) - self.assertAlmostEqual(2.4, lyap_rx, delta=0.5) - self.assertAlmostEqual(2.4, lyap_ry, delta=0.5) - self.assertAlmostEqual(2.4, lyap_rz, delta=0.5) - assert lyap_ex[0] > 1.5 - assert lyap_ey[0] > 1.5 - assert lyap_ez[0] > 1.5 + with self.subTest(measure="lyap_r", axis="x"): + lyap_rx = measures.lyap_r(data[:, 0], **lyap_r_args) + self.assertAlmostEqual(2.4, lyap_rx, delta=0.5) + with self.subTest(measure="lyap_r", axis="y"): + lyap_ry = measures.lyap_r(data[:, 1], **lyap_r_args) + self.assertAlmostEqual(2.4, lyap_ry, delta=0.5) + with self.subTest(measure="lyap_r", axis="z"): + lyap_rz = measures.lyap_r(data[:, 2], **lyap_r_args) + self.assertAlmostEqual(2.4, lyap_rz, delta=0.5) + with self.subTest(measure="lyap_e", axis="x"): + lyap_ex = measures.lyap_e(data[:, 0], **lyap_e_args) + self.assertGreater(lyap_ex[0], 1.5) + with self.subTest(measure="lyap_e", axis="y"): + lyap_ey = measures.lyap_e(data[:, 1], **lyap_e_args) + self.assertGreater(lyap_ey[0], 1.5) + with self.subTest(measure="lyap_e", axis="z"): + lyap_ez = measures.lyap_e(data[:, 2], **lyap_e_args) + self.assertGreater(lyap_ez[0], 1.5) def test_lyap_fbm(self) -> None: + """Hypothesis: lyap_e produces positive output for fractional brownian motion.""" data = datasets.fbm(1000, H=0.3) - le = nolds.lyap_e(data, emb_dim=7, matrix_dim=3) - assert np.max(le) > 0 + le = 
measures.lyap_e(data, emb_dim=7, matrix_dim=3) + self.assertGreater(float(np.max(le)), 0) def test_lyap_r_limits(self) -> None: - """Tests if minimal input size is correctly calculated.""" - np.random.seed(0) - for i in range(10): + """Hypothesis: Minimal input size for lyap_r is correctly calculated. + + For each of 10 random parameter settings, we test a range of input sizes around + the supposed minimum number of inputs. For numbers smaller than the calculated + minimum we expect the call of lyap_r to fail, for numbers greater or equal, it + should succeed. + """ + + def expect_fail(required: int, kwargs: dict[str, Any], actual: int) -> None: + msg = ( + f"{required} data points should be required for kwargs {kwargs}, " + f"but {actual} were enough" + ) + with self.assertRaises(ValueError, msg=msg), warnings.catch_warnings(): + warnings.simplefilter("ignore", RuntimeWarning) + measures.lyap_r(data, fit="poly", **kwargs) # pyright: ignore reportArgumentType + + def expect_success(required: int, kwargs: dict[str, Any], actual: int) -> None: + msg = ( + f"{required} data points should be enough for kwargs {kwargs}, but " + f"{actual} were too few" + ) + try: + assert np.all(np.isfinite(measures.lyap_r(data, fit="poly", **kwargs))), ( # pyright: ignore reportArgumentType + msg + ) + except ValueError as e: + raise ValueError(msg) from e + + rng = np.random.default_rng(seed=0) + for _ in range(10): kwargs = { - "emb_dim": np.random.randint(1, 10), - "lag": np.random.randint(1, 6), - "min_tsep": np.random.randint(0, 5), - "trajectory_len": np.random.randint(2, 10), + "emb_dim": rng.integers(1, 10), + "lag": rng.integers(1, 6), + "min_tsep": rng.integers(0, 5), + "trajectory_len": rng.integers(2, 10), } - min_len = nolds.lyap_r_len(**kwargs) - for i in reversed(range(max(1, min_len - 5), min_len + 5)): - data = np.random.random(i) - if i < min_len: - ## too few data points => execution should fail - try: - with warnings.catch_warnings(): - 
warnings.simplefilter("ignore", RuntimeWarning) - nolds.lyap_r(data, fit="poly", **kwargs) - msg = "{} data points should be required for kwargs {}, but {} where enough" - self.fail( - msg.format( - min_len, - kwargs, - i, - ) - ) - except ValueError: - # print(e) - pass - else: - ## enough data points => execution should succeed - msg = "{} data points should be enough for kwargs {}, but {} where too few" - try: - assert np.all(np.isfinite(nolds.lyap_r(data, fit="poly", **kwargs))), ( - msg.format(min_len, kwargs, i) - ) - except ValueError as e: - self.fail( - msg.format(min_len, kwargs, i) + ", original error: " + str(e), - ) + min_len = measures.lyap_r_len(**kwargs) # pyright: ignore reportArgumentType + for actual_len in reversed(range(max(1, min_len - 5), min_len + 5)): + data = rng.random(actual_len) + with self.subTest( + emb_dim=kwargs["emb_dim"], + lag=kwargs["lag"], + min_tsep=kwargs["min_tsep"], + trajectory_len=kwargs["trajectory_len"], + min_len=min_len, + actual_len=actual_len, + ): + if actual_len < min_len: + ## too few data points => execution should fail + expect_fail(min_len, kwargs, actual_len) + else: + ## enough data points => execution should succeed + expect_success(min_len, kwargs, actual_len) def test_lyap_e_limits(self) -> None: """Tests if minimal input size is correctly calculated.""" @@ -229,7 +286,7 @@ def test_lyap_e_limits(self) -> None: "min_nb": np.random.randint(2, 15), } kwargs["emb_dim"] = np.random.randint(1, 4) * (kwargs["matrix_dim"] - 1) + 1 - min_len = nolds.lyap_e_len(**kwargs) + min_len = measures.lyap_e_len(**kwargs) for i in reversed(range(max(1, min_len - 5), min_len + 5)): data = np.random.random(i) if i < min_len: @@ -237,7 +294,7 @@ def test_lyap_e_limits(self) -> None: try: with warnings.catch_warnings(): warnings.simplefilter("ignore", RuntimeWarning) - nolds.lyap_e(data, **kwargs) + measures.lyap_e(data, **kwargs) msg = "{} data points should be required for kwargs {}, but {} where enough" self.fail( 
msg.format( @@ -253,7 +310,7 @@ def test_lyap_e_limits(self) -> None: ## enough data points => execution should succeed msg = "{} data points should be enough for kwargs {}, but {} where too few" try: - assert np.all(np.isfinite(nolds.lyap_e(data, **kwargs))), msg.format( + assert np.all(np.isfinite(measures.lyap_e(data, **kwargs))), msg.format( min_len, kwargs, i ) except ValueError as e: @@ -273,14 +330,14 @@ def test_hurst_basic(self) -> None: for _ in range(10000): x = -x + np.random.random() - 0.5 seq_neg.append(x) - h_neg = nolds.hurst_rs(seq_neg) + h_neg = measures.hurst_rs(seq_neg) # print("h_neg = %.3f" % h_neg) # expected h is around 0 assert h_neg < 0.3 # no correlation, just random noise x = np.random.randn(10000) - h_rand = nolds.hurst_rs(x) + h_rand = measures.hurst_rs(x) # print("h_rand = %.3f" % h_rand) # expected h is around 0.5 self.assertAlmostEqual(h_rand, 0.5, delta=0.1) @@ -288,7 +345,7 @@ def test_hurst_basic(self) -> None: # cumulative sum has strong positive correlation between # elements walk = np.cumsum(x) - h_walk = nolds.hurst_rs(walk) + h_walk = measures.hurst_rs(walk) # print("h_walk = %.3f" % h_walk) # expected h is around 1.0 assert h_walk > 0.9 @@ -298,7 +355,7 @@ def test_hurst_pracma(self) -> None: np.random.seed(3) # This test reproduces the results presented by Ian L. 
Kaplan on # bearcave.com - h72 = nolds.hurst_rs( + h72 = measures.hurst_rs( datasets.brown72, fit="poly", corrected=False, @@ -309,12 +366,12 @@ def test_hurst_pracma(self) -> None: self.assertAlmostEqual(h72, 0.72, delta=0.01) xgn = np.random.normal(size=10000) - hgn = nolds.hurst_rs(xgn, fit="poly") + hgn = measures.hurst_rs(xgn, fit="poly") # print("hgn = %.3f" % hgn) self.assertAlmostEqual(hgn, 0.5, delta=0.1) - xlm = np.fromiter(datasets.logistic_map(0.1, 1024), dtype="float32") - hlm = nolds.hurst_rs(xlm, fit="poly", nvals=2 ** np.arange(3, 11)) + xlm = np.fromiter(datasets.logistic_map(0.1, 1024), dtype=np.float64) + hlm = measures.hurst_rs(xlm, fit="poly", nvals=2 ** np.arange(3, 11)) # print("hlm = %.3f" % hlm) self.assertAlmostEqual(hlm, 0.43, delta=0.05) @@ -332,10 +389,10 @@ def test_hurst_lorenz(self) -> None: 2009, doi: 10.1007/s11207-009-9467-x. """ data = datasets.lorenz_euler(3000, 10, 28, 8 / 3.0, start=[1, 1, 1], dt=0.01)[1000:] - hurst_rs_args = {"fit": "poly", "nvals": nolds.logarithmic_n(10, 70, 1.1)} - hx = nolds.hurst_rs(data[:, 0], **hurst_rs_args) - hy = nolds.hurst_rs(data[:, 1], **hurst_rs_args) - hz = nolds.hurst_rs(data[:, 2], **hurst_rs_args) + hurst_rs_args = {"fit": "poly", "nvals": measures.logarithmic_n(10, 70, 1.1)} + hx = measures.hurst_rs(data[:, 0], **hurst_rs_args) + hy = measures.hurst_rs(data[:, 1], **hurst_rs_args) + hz = measures.hurst_rs(data[:, 2], **hurst_rs_args) self.assertAlmostEqual(0.9, hx, delta=0.05) self.assertAlmostEqual(0.9, hy, delta=0.05) self.assertAlmostEqual(0.9, hz, delta=0.05) @@ -352,13 +409,13 @@ def test_dfa_base(self) -> None: for _ in range(10000): x = -x + np.random.random() - 0.5 seq_neg.append(x) - h_neg = nolds.dfa(seq_neg) + h_neg = measures.dfa(seq_neg) # expected h is around 0 assert h_neg < 0.3 # no correlation, just random noise x = np.random.randn(10000) - h_rand = nolds.dfa(x) + h_rand = measures.dfa(x) # expected h is around 0.5 assert h_rand < 0.7 assert h_rand > 0.3 @@ -366,7 +423,7 
@@ def test_dfa_base(self) -> None: # cumulative sum has strong positive correlation between # elements walk = np.cumsum(x) - h_walk = nolds.dfa(walk) + h_walk = measures.dfa(walk) # expected h is around 1.0 assert h_walk > 0.7 @@ -374,7 +431,7 @@ def test_dfa_fbm(self) -> None: hs = [0.3, 0.5, 0.7] for h in hs: data = datasets.fbm(1000, H=h) - he = nolds.dfa(data) + he = measures.dfa(data) self.assertAlmostEqual(he, h + 1, delta=0.15) def test_dfa_lorenz(self) -> None: @@ -395,11 +452,11 @@ def test_dfa_lorenz(self) -> None: data = datasets.lorenz_euler(120000, 10, 28, 8 / 3.0, start=[0.1, 0.1, 0.1], dt=0.002)[ 20000: ] - nvals = nolds.logarithmic_n(200, len(data) / 8, 2**0.2) + nvals = measures.logarithmic_n(200, len(data) / 8, 2**0.2) dfa_args = {"nvals": nvals, "order": 2, "overlap": False, "fit_exp": "poly"} - dx = nolds.dfa(data[:, 0], **dfa_args) - dy = nolds.dfa(data[:, 1], **dfa_args) - dz = nolds.dfa(data[:, 2], **dfa_args) + dx = measures.dfa(data[:, 0], **dfa_args) + dy = measures.dfa(data[:, 1], **dfa_args) + dz = measures.dfa(data[:, 2], **dfa_args) self.assertAlmostEqual(1.008, dx, delta=0.04) self.assertAlmostEqual(0.926, dy, delta=0.032) self.assertAlmostEqual(0.650, dz, delta=0.44) @@ -408,7 +465,7 @@ def test_dfa_agreement_with_physionet(self) -> None: """Test hypothesis: Using the same parameters, the output of nolds is identical to the output of PhysioNet.""" lorenz_x, physionet_points = datasets.load_lorenz_physionet() nvals = [round(x) for x in 10 ** physionet_points[:, 0]] - _, (_, nolds_rs, _) = nolds.dfa( + _, (_, nolds_rs, _) = measures.dfa( lorenz_x, nvals=nvals, overlap=False, fit_exp="poly", debug_data=True ) nolds_rs_log10 = nolds_rs / np.log(10) @@ -423,7 +480,7 @@ def test_dfa_levy(self) -> None: """ alpha = 1.5 x = levy_stable.rvs(alpha=alpha, beta=0, size=10000) - h = nolds.dfa(x, fit_exp="poly") + h = measures.dfa(x, fit_exp="poly") self.assertAlmostEqual(0.5, h, delta=0.1) @@ -434,11 +491,11 @@ def test_corr_dim(self) -> None: 
np.random.seed(5) n = 1000 data = np.arange(n) - cd = nolds.corr_dim(data, 4) + cd = measures.corr_dim(data, 4) self.assertAlmostEqual(cd, 1, delta=0.05) # TODO what is the prescribed correlation dimension for random data? data = np.random.random(n) - cd = nolds.corr_dim(data, 4, fit="poly") + cd = measures.corr_dim(data, 4, fit="poly") self.assertAlmostEqual(cd, 0.5, delta=0.15) # TODO test example for cd > 1 @@ -463,8 +520,8 @@ def test_lorenz(self) -> None: emb_dim = 5 data = datasets.lorenz_euler(n + discard, 10, 28, 8 / 3, start=(1, 1, 1), dt=0.012) x = data[discard:, 1] - rvals = nolds.logarithmic_r(1, np.e, 1.1) # determined experimentally - cd = nolds.corr_dim(x, emb_dim, fit="poly", rvals=rvals, lag=lag) + rvals = measures.logarithmic_r(1, np.e, 1.1) # determined experimentally + cd = measures.corr_dim(x, emb_dim, fit="poly", rvals=rvals, lag=lag) self.assertAlmostEqual(cd, 2.05, delta=0.2) def test_logistic(self) -> None: @@ -479,44 +536,44 @@ def test_sampen_base(self) -> None: data = [0, 1, 5, 4, 1, 0, 1, 5, 3] # matches for m=2: 01-01, 15-15 # matches for m=3: 015-015 - se = nolds.sampen(data) + se = measures.sampen(data) self.assertAlmostEqual(se, -np.log(1.0 / 2), delta=0.01) data = [1, 2, 1, 2.4, 1, 4] # matches for m=1: 1-1,1-1,2-2.4,1-1 # matches for m=2: [1,2]-[1,2.4], [2,1]-[2.4,1] - se = nolds.sampen(data, emb_dim=1, tolerance=0.5) + se = measures.sampen(data, emb_dim=1, tolerance=0.5) self.assertAlmostEqual(se, -np.log(2.0 / 4), delta=0.01) data = [0, 20, 1, 2, 3, 4, 40, 60, 1.4, 2.4, 3.4, 80, 100, 1.4, 2.4, 3.4, 4, 120, 140, 180] # maches for m=3: [1,2,3]-[1.4,2.4,3.4],[1,2,3]-[1.4,2.4,3.4], # [2,3,4]-[2.4,3.4,4], [1.4,2.4,3.4]-[1.4,2.4,3.4] # matches for m=4: [1,2,3,4]-[1.4,2.4,3.4,4] - se = nolds.sampen(data, emb_dim=3, tolerance=0.5) + se = measures.sampen(data, emb_dim=3, tolerance=0.5) self.assertAlmostEqual(se, -np.log(1.0 / 4), delta=0.01) def test_sampen_logistic(self) -> None: # logistic map with r = 2.8 => static value data = 
list(datasets.logistic_map(0.45, 1000, r=2.8)) - self.assertAlmostEqual(0, nolds.sampen(data), delta=0.001) - self.assertAlmostEqual(0, nolds.sampen(data[100:], emb_dim=5), delta=0.001) + self.assertAlmostEqual(0, measures.sampen(data), delta=0.001) + self.assertAlmostEqual(0, measures.sampen(data[100:], emb_dim=5), delta=0.001) # logistic map with r = 3.3 => oscillation between two values data = list(datasets.logistic_map(0.45, 1000, r=3.3)) - self.assertAlmostEqual(0, nolds.sampen(data), delta=0.001) - self.assertAlmostEqual(0, nolds.sampen(data[100:], emb_dim=5), delta=0.001) + self.assertAlmostEqual(0, measures.sampen(data), delta=0.001) + self.assertAlmostEqual(0, measures.sampen(data[100:], emb_dim=5), delta=0.001) # logistic map with r = 3.5 => oscillation between four values data = list(datasets.logistic_map(0.45, 1000, r=3.5)) - self.assertAlmostEqual(0, nolds.sampen(data), delta=0.001) - self.assertAlmostEqual(0, nolds.sampen(data[100:], emb_dim=5), delta=0.001) + self.assertAlmostEqual(0, measures.sampen(data), delta=0.001) + self.assertAlmostEqual(0, measures.sampen(data[100:], emb_dim=5), delta=0.001) # logistic map with r = 3.9 => chaotic behavior data = list(datasets.logistic_map(0.45, 1000, r=3.9)) - self.assertAlmostEqual(0.5, nolds.sampen(data[100:]), delta=0.1) - self.assertAlmostEqual(0.5, nolds.sampen(data[100:], emb_dim=5), delta=0.1) + self.assertAlmostEqual(0.5, measures.sampen(data[100:]), delta=0.1) + self.assertAlmostEqual(0.5, measures.sampen(data[100:], emb_dim=5), delta=0.1) def test_sampen_random(self) -> None: np.random.seed(6) # normally distributed random numbers data = np.random.randn(10000) - self.assertAlmostEqual(2.2, nolds.sampen(data), delta=0.1) - self.assertAlmostEqual(2.2, nolds.sampen(data, emb_dim=2), delta=0.1) + self.assertAlmostEqual(2.2, measures.sampen(data), delta=0.1) + self.assertAlmostEqual(2.2, measures.sampen(data, emb_dim=2), delta=0.1) # TODO add tests with uniformly distributed random numbers def 
test_sampen_sinus(self) -> None: @@ -539,9 +596,9 @@ def test_sampen_lorenz(self) -> None: """ data = datasets.lorenz_euler(3000, 10, 28, 8 / 3.0, start=[1, 1, 1], dt=0.01)[1000:] sampen_args = {"emb_dim": 2, "lag": 1} - sx = nolds.sampen(data[:, 0], **sampen_args) - sy = nolds.sampen(data[:, 1], **sampen_args) - sz = nolds.sampen(data[:, 2], **sampen_args) + sx = measures.sampen(data[:, 0], **sampen_args) + sy = measures.sampen(data[:, 1], **sampen_args) + sz = measures.sampen(data[:, 2], **sampen_args) self.assertAlmostEqual(0.15, sx, delta=0.05) self.assertAlmostEqual(0.15, sy, delta=0.05) self.assertAlmostEqual(0.25, sz, delta=0.05) @@ -557,16 +614,16 @@ class RegressionTests(unittest.TestCase): def test_sampen(self) -> None: """Test hypothesis: The exact output of sampen() on random data hasn't changed since the last version.""" data = datasets.load_qrandom()[:1000] - se = nolds.sampen( - data, emb_dim=2, tolerance=None, lag=1, dist=nolds.rowwise_chebyshev, closed=False + se = measures.sampen( + data, emb_dim=2, tolerance=None, lag=1, dist=measures.rowwise_chebyshev, closed=False ) self.assertAlmostEqual(2.1876999522832743, se, places=14) def test_corr_dim(self) -> None: """Test hypothesis: The exact output of corr_dim() with `fit=poly` on random data hasn't changed since the last version.""" data = datasets.load_qrandom()[:1000] - cd = nolds.corr_dim( - data, emb_dim=5, lag=1, rvals=None, dist=nolds.rowwise_euclidean, fit="poly" + cd = measures.corr_dim( + data, emb_dim=5, lag=1, rvals=None, dist=measures.rowwise_euclidean, fit="poly" ) self.assertAlmostEqual(1.303252839255068, cd, places=14) @@ -578,16 +635,16 @@ def test_corr_dim_RANSAC(self) -> None: # fix seed np.random.seed(42) # usa a too wide range for rvals to give RANSAC something to do ;) - rvals = nolds.logarithmic_r(0.01 * sd, 2 * sd, 1.03) - cd = nolds.corr_dim( - data, emb_dim=5, lag=1, rvals=rvals, dist=nolds.rowwise_euclidean, fit="RANSAC" + rvals = measures.logarithmic_r(0.01 * sd, 2 * sd, 
1.03) + cd = measures.corr_dim( + data, emb_dim=5, lag=1, rvals=rvals, dist=measures.rowwise_euclidean, fit="RANSAC" ) self.assertAlmostEqual(0.44745494643404665, cd, places=14) def test_lyap_e(self) -> None: """Test hypothesis: The exact output of lyap_e() on random data hasn't changed since the last version.""" data = datasets.load_qrandom()[:1000] - le = nolds.lyap_e(data, emb_dim=10, matrix_dim=4, min_nb=10, min_tsep=1, tau=1) + le = measures.lyap_e(data, emb_dim=10, matrix_dim=4, min_nb=10, min_tsep=1, tau=1) expected = np.array( [0.03779942603329712, -0.014314012551504982, -0.08436867977030214, -0.22316730257003717] ) @@ -599,7 +656,7 @@ def test_lyap_e(self) -> None: def test_lyap_r(self) -> None: """Test hypothesis: The exact output of lyap_r() with `fit=poly` on random data hasn't changed since the last version.""" data = datasets.load_qrandom()[:1000] - le = nolds.lyap_r( + le = measures.lyap_r( data, emb_dim=10, lag=1, @@ -619,7 +676,7 @@ def test_lyap_r_RANSAC(self) -> None: np.random.seed(42) # set lag to 2 for weird duplicate lines # set trajectory_len to 100 to get many datapoints for RANSAC to choose from - le = nolds.lyap_r( + le = measures.lyap_r( data, emb_dim=10, lag=2, @@ -635,7 +692,7 @@ def test_lyap_r_RANSAC(self) -> None: def test_hurst_rs(self) -> None: """Test hypothesis: The exact output of hurst_rs() with `fit=poly` on random data hasn't changed since the last version.""" data = datasets.load_qrandom()[:1000] - rs = nolds.hurst_rs(data, nvals=None, fit="poly", corrected=True, unbiased=True) + rs = measures.hurst_rs(data, nvals=None, fit="poly", corrected=True, unbiased=True) expected = 0.5123887964986258 self.assertAlmostEqual(expected, rs, places=14) @@ -645,15 +702,15 @@ def test_hurst_rs_RANSAC(self) -> None: data = datasets.load_qrandom()[:1000] np.random.seed(42) # increase nsteps in nvals to have more data points for RANSAC to choose from - nvals = nolds.logmid_n(data.shape[0], ratio=1 / 4.0, nsteps=100) - rs = 
nolds.hurst_rs(data, nvals=nvals, fit="RANSAC", corrected=True, unbiased=True) + nvals = measures.logmid_n(data.shape[0], ratio=1 / 4.0, nsteps=100) + rs = measures.hurst_rs(data, nvals=nvals, fit="RANSAC", corrected=True, unbiased=True) expected = 0.4805431939943321 self.assertAlmostEqual(expected, rs, places=14) def test_dfa(self) -> None: """Test hypothesis: The exact output of dfa() with `fit_exp=poly` on random data hasn't changed since the last version.""" data = datasets.load_qrandom()[:1000] - h = nolds.dfa(data, nvals=None, overlap=True, order=1, fit_trend="poly", fit_exp="poly") + h = measures.dfa(data, nvals=None, overlap=True, order=1, fit_trend="poly", fit_exp="poly") expected = 0.5450874638765073 self.assertAlmostEqual(expected, h, places=14) @@ -664,15 +721,17 @@ def test_dfa_RANSAC(self) -> None: data = datasets.load_qrandom()[:1000] + np.arange(1000) * 100 np.random.seed(42) # adds more steps and higher values to nvals to introduce some scattering for RANSAC to have an effect on - nvals = nolds.logarithmic_n(10, 0.9 * data.shape[0], 1.1) - h = nolds.dfa(data, nvals=nvals, overlap=True, order=1, fit_trend="poly", fit_exp="RANSAC") + nvals = measures.logarithmic_n(10, 0.9 * data.shape[0], 1.1) + h = measures.dfa( + data, nvals=nvals, overlap=True, order=1, fit_trend="poly", fit_exp="RANSAC" + ) expected = 1.1372303125405405 self.assertAlmostEqual(expected, h, places=14) def test_mfhurst_b(self) -> None: """Test hypothesis: The exact output of mfhurst_b() with `fit=poly` on random data hasn't changed since the last version.""" data = datasets.load_qrandom()[:1000] - h = nolds.mfhurst_b(data, qvals=[1], dists=None, fit="poly") + h = measures.mfhurst_b(data, qvals=[1], dists=None, fit="poly") expected = [-0.00559398934417339] self.assertAlmostEqual(expected[0], h[0], places=14) @@ -681,14 +740,16 @@ def test_mfhurst_b_RANSAC(self) -> None: """Test hypothesis: The exact output of mfhurst_b() with `fit=RANSAC` on random data hasn't changed since the last 
version.""" data = datasets.load_qrandom()[:1000] np.random.seed(42) - h = nolds.mfhurst_b(data, qvals=[1], dists=None, fit="RANSAC") + h = measures.mfhurst_b(data, qvals=[1], dists=None, fit="RANSAC") expected = [-0.009056463064211057] self.assertAlmostEqual(expected[0], h[0], places=14) def test_mfhurst_dm(self) -> None: """Test hypothesis: The exact output of mfhurst_dm() with `fit=poly` on random data hasn't changed since the last version.""" data = datasets.load_qrandom()[:1000] - h, _ = nolds.mfhurst_dm(data, qvals=[1], max_dists=range(5, 20), detrend=True, fit="poly") + h, _ = measures.mfhurst_dm( + data, qvals=[1], max_dists=range(5, 20), detrend=True, fit="poly" + ) expected = [0.008762803881203145] self.assertAlmostEqual(expected[0], h[0], places=14) @@ -697,7 +758,9 @@ def test_mfhurst_dm_RANSAC(self) -> None: """Test hypothesis: The exact output of mfhurst_dm() with `fit=RANSAC` on random data hasn't changed since the last version.""" data = datasets.load_qrandom()[:1000] np.random.seed(42) - h, _ = nolds.mfhurst_dm(data, qvals=[1], max_dists=range(5, 20), detrend=True, fit="RANSAC") + h, _ = measures.mfhurst_dm( + data, qvals=[1], max_dists=range(5, 20), detrend=True, fit="RANSAC" + ) expected = [0.005324834328837356] self.assertAlmostEqual(expected[0], h[0], places=14) @@ -716,7 +779,7 @@ def test_lyap_r_complex_min_tsep(self) -> None: # TypeError: ufunc 'ceil' not supported for the input types, and the # inputs could not be safely coerced to any supported types according to # the casting rule ''safe'' - nolds.lyap_r(data) + measures.lyap_r(data) if __name__ == "__main__": From 8124d74e09c589a0073ac272b2e8ff1aef994eb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christopher=20Sch=C3=B6lzel?= Date: Mon, 25 Aug 2025 15:13:37 +0200 Subject: [PATCH 23/36] modernizes and cleans up length tests for Lyapunov measures --- nolds/test_measures.py | 148 ++++++++++++++++++++++++----------------- 1 file changed, 87 insertions(+), 61 deletions(-) diff --git 
a/nolds/test_measures.py b/nolds/test_measures.py index 618e7c2..7944289 100644 --- a/nolds/test_measures.py +++ b/nolds/test_measures.py @@ -6,7 +6,7 @@ import warnings # from numpy.testing import assert_equal as assert_array_equal -from typing import Any +from typing import Any, Protocol import numpy as np from numpy.typing import ArrayLike, DTypeLike @@ -142,6 +142,13 @@ def test_logarithmic_r(self) -> None: self.assertSequenceEqual(x, [4, 6.04, 9.1204]) +class NoldsMeasure(Protocol): + """Protocol for typing methods that take a float array as first parameter.""" + + def __call__(self, data: measures.FloatArrayLike1D) -> Any: # noqa: ANN401 + """Call the measure.""" + + class TestNoldsLyap(unittest.TestCase): """Tests for lyap_e and lyap_r.""" @@ -220,6 +227,53 @@ def test_lyap_fbm(self) -> None: le = measures.lyap_e(data, emb_dim=7, matrix_dim=3) self.assertGreater(float(np.max(le)), 0) + def assert_insufficient_length( + self, + min_len: int, + kwargs: dict[str, Any], + input_data: measures.FloatArray1D, + measure: NoldsMeasure, + ) -> None: + """Ensures that the length of the given data would actually lead to an error. + + Args: + min_len: reported minimum length + kwargs: kwargs to be passed to the measure + input_data: data with length less than `min_len` + measure: the nolds measure to test (either `lyap_r` or `lyap_e`) + """ + msg = ( + f"{min_len} data points should be required for kwargs {kwargs}, " + f"but {input_data.shape[0]} were enough" + ) + with self.assertRaises(ValueError, msg=msg), warnings.catch_warnings(): + warnings.simplefilter("ignore", RuntimeWarning) + measure(input_data, **kwargs) # pyright: ignore reportArgumentType + + def assert_sufficient_length( + self, + min_len: int, + kwargs: dict[str, Any], + input_data: measures.FloatArray1D, + measure: NoldsMeasure, + ) -> None: + """Ensures that the length of the given data does not lead to an error. 
+ + Args: + min_len: reported minimum length + kwargs: kwargs to be passed to the measure + input_data: data with length at least `min_len` + measure: the nolds measure to test (either `lyap_r` or `lyap_e`) + """ + msg = ( + f"{min_len} data points should be enough for kwargs {kwargs}, but " + f"{input_data.shape[0]} were too few" + ) + try: + assert np.all(np.isfinite(measure(input_data, **kwargs))), msg + except ValueError as e: + raise ValueError(msg) from e + def test_lyap_r_limits(self) -> None: """Hypothesis: Minimal input size for lyap_r is correctly calculated. @@ -228,37 +282,16 @@ def test_lyap_r_limits(self) -> None: minimum we expect the call of lyap_r to fail, for numbers greater or equal, it should succeed. """ - - def expect_fail(required: int, kwargs: dict[str, Any], actual: int) -> None: - msg = ( - f"{required} data points should be required for kwargs {kwargs}, " - f"but {actual} were enough" - ) - with self.assertRaises(ValueError, msg=msg), warnings.catch_warnings(): - warnings.simplefilter("ignore", RuntimeWarning) - measures.lyap_r(data, fit="poly", **kwargs) # pyright: ignore reportArgumentType - - def expect_success(required: int, kwargs: dict[str, Any], actual: int) -> None: - msg = ( - f"{required} data points should be enough for kwargs {kwargs}, but " - f"{actual} were too few" - ) - try: - assert np.all(np.isfinite(measures.lyap_r(data, fit="poly", **kwargs))), ( # pyright: ignore reportArgumentType - msg - ) - except ValueError as e: - raise ValueError(msg) from e - rng = np.random.default_rng(seed=0) for _ in range(10): - kwargs = { + kwargs: dict[str, Any] = { "emb_dim": rng.integers(1, 10), "lag": rng.integers(1, 6), "min_tsep": rng.integers(0, 5), "trajectory_len": rng.integers(2, 10), } min_len = measures.lyap_r_len(**kwargs) # pyright: ignore reportArgumentType + kwargs["fit"] = "poly" for actual_len in reversed(range(max(1, min_len - 5), min_len + 5)): data = rng.random(actual_len) with self.subTest( @@ -271,51 +304,44 @@ def 
expect_success(required: int, kwargs: dict[str, Any], actual: int) -> None: ): if actual_len < min_len: ## too few data points => execution should fail - expect_fail(min_len, kwargs, actual_len) + self.assert_insufficient_length( + min_len=min_len, kwargs=kwargs, input_data=data, measure=measures.lyap_r + ) else: ## enough data points => execution should succeed - expect_success(min_len, kwargs, actual_len) + self.assert_sufficient_length( + min_len=min_len, kwargs=kwargs, input_data=data, measure=measures.lyap_r + ) def test_lyap_e_limits(self) -> None: """Tests if minimal input size is correctly calculated.""" - np.random.seed(1) - for i in range(10): + rng = np.random.default_rng(seed=1) + for _ in range(10): kwargs = { - "matrix_dim": np.random.randint(2, 10), - "min_tsep": np.random.randint(0, 10), - "min_nb": np.random.randint(2, 15), + "matrix_dim": rng.integers(2, 10), + "min_tsep": rng.integers(0, 10), + "min_nb": rng.integers(2, 15), } - kwargs["emb_dim"] = np.random.randint(1, 4) * (kwargs["matrix_dim"] - 1) + 1 - min_len = measures.lyap_e_len(**kwargs) - for i in reversed(range(max(1, min_len - 5), min_len + 5)): - data = np.random.random(i) - if i < min_len: - ## too few data points => execution should fail - try: - with warnings.catch_warnings(): - warnings.simplefilter("ignore", RuntimeWarning) - measures.lyap_e(data, **kwargs) - msg = "{} data points should be required for kwargs {}, but {} where enough" - self.fail( - msg.format( - min_len, - kwargs, - i, - ) - ) - except ValueError: - # print(e) - pass - else: - ## enough data points => execution should succeed - msg = "{} data points should be enough for kwargs {}, but {} where too few" - try: - assert np.all(np.isfinite(measures.lyap_e(data, **kwargs))), msg.format( - min_len, kwargs, i + kwargs["emb_dim"] = rng.integers(1, 4) * (kwargs["matrix_dim"] - 1) + 1 + min_len = measures.lyap_e_len(**kwargs) # pyright: ignore reportArgumentType + for actual_len in reversed(range(max(1, min_len - 5), 
min_len + 5)): + data = rng.random(actual_len) + with self.subTest( + matrix_dim=kwargs["matrix_dim"], + min_tsep=kwargs["min_tsep"], + min_nb=kwargs["min_nb"], + min_len=min_len, + actual_len=actual_len, + ): + if actual_len < min_len: + ## too few data points => execution should fail + self.assert_insufficient_length( + min_len=min_len, kwargs=kwargs, input_data=data, measure=measures.lyap_e ) - except ValueError as e: - self.fail( - msg.format(min_len, kwargs, i) + ", original error: " + str(e), + else: + ## enough data points => execution should succeed + self.assert_sufficient_length( + min_len=min_len, kwargs=kwargs, input_data=data, measure=measures.lyap_e ) From 2faf52fbc4e42c9cc244fbec605a93c3e511095f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christopher=20Sch=C3=B6lzel?= Date: Mon, 25 Aug 2025 19:25:04 +0200 Subject: [PATCH 24/36] finishes modernizing tests --- nolds/test_measures.py | 563 +++++++++++++++++++++++++++-------------- 1 file changed, 371 insertions(+), 192 deletions(-) diff --git a/nolds/test_measures.py b/nolds/test_measures.py index 7944289..f64dfa3 100644 --- a/nolds/test_measures.py +++ b/nolds/test_measures.py @@ -4,20 +4,19 @@ import unittest import warnings - -# from numpy.testing import assert_equal as assert_array_equal -from typing import Any, Protocol +from typing import TYPE_CHECKING, Any, Protocol, cast import numpy as np -from numpy.typing import ArrayLike, DTypeLike +from numpy.testing import assert_almost_equal -from nolds import datasets, measures +if TYPE_CHECKING: + from numpy.typing import ArrayLike, DTypeLike -# TODO add tests for mfhurst_b and mfhurst_dm +from nolds import datasets, measures -# TODO add more tests using fgn and fbm for hurst_rs and dfa +# TODO: add tests for mfhurst_b and mfhurst_dm -# TODO split up tests into smaller units => one hypothesis = one test +# TODO: add more tests using fgn and fbm for hurst_rs and dfa try: from scipy.stats import levy_stable @@ -201,7 +200,13 @@ def test_lyap_lorenz(self) -> 
None: "fit_offset": 8, "fit": "poly", } - lyap_e_args = {"min_tsep": 10, "emb_dim": 5, "matrix_dim": 5, "tau": 0.01, "min_nb": 8} + lyap_e_args = { + "min_tsep": 10, + "emb_dim": 5, + "matrix_dim": 5, + "tau": 0.01, + "min_nb": 8, + } with self.subTest(measure="lyap_r", axis="x"): lyap_rx = measures.lyap_r(data[:, 0], **lyap_r_args) self.assertAlmostEqual(2.4, lyap_rx, delta=0.5) @@ -305,12 +310,18 @@ def test_lyap_r_limits(self) -> None: if actual_len < min_len: ## too few data points => execution should fail self.assert_insufficient_length( - min_len=min_len, kwargs=kwargs, input_data=data, measure=measures.lyap_r + min_len=min_len, + kwargs=kwargs, + input_data=data, + measure=measures.lyap_r, ) else: ## enough data points => execution should succeed self.assert_sufficient_length( - min_len=min_len, kwargs=kwargs, input_data=data, measure=measures.lyap_r + min_len=min_len, + kwargs=kwargs, + input_data=data, + measure=measures.lyap_r, ) def test_lyap_e_limits(self) -> None: @@ -336,51 +347,62 @@ def test_lyap_e_limits(self) -> None: if actual_len < min_len: ## too few data points => execution should fail self.assert_insufficient_length( - min_len=min_len, kwargs=kwargs, input_data=data, measure=measures.lyap_e + min_len=min_len, + kwargs=kwargs, + input_data=data, + measure=measures.lyap_e, ) else: ## enough data points => execution should succeed self.assert_sufficient_length( - min_len=min_len, kwargs=kwargs, input_data=data, measure=measures.lyap_e + min_len=min_len, + kwargs=kwargs, + input_data=data, + measure=measures.lyap_e, ) class TestNoldsHurst(unittest.TestCase): """Tests for hurst_rs.""" - def test_hurst_basic(self) -> None: - np.random.seed(2) + @classmethod + def setUpClass(cls) -> None: + """Create data required for test methods.""" + rng = np.random.default_rng(seed=2) # strong negative correlation between successive elements - seq_neg = [] - x = np.random.random() + cls.negative_correlation = [] + x = rng.random() for _ in range(10000): - x = -x 
+ np.random.random() - 0.5 - seq_neg.append(x) - h_neg = measures.hurst_rs(seq_neg) - # print("h_neg = %.3f" % h_neg) + x = -x + rng.random() - 0.5 + cls.negative_correlation.append(x) + # no correlation, just gaussian noise + cls.no_correlation = rng.standard_normal(10000) + # cumulative sum has strong positive correlation between + # elements + cls.positive_correlation = np.cumsum(cls.no_correlation) + + def test_hurst_negative_correlation(self) -> None: + """Hypothesis: H < 0.5 for data with negative correlation between successive elements.""" + h_neg = measures.hurst_rs(self.negative_correlation) # expected h is around 0 - assert h_neg < 0.3 + self.assertLess(h_neg, 0.3) - # no correlation, just random noise - x = np.random.randn(10000) - h_rand = measures.hurst_rs(x) - # print("h_rand = %.3f" % h_rand) + def test_hurst_gaussian_noise(self) -> None: + """Hypothesis: H ~= 0.5 for gaussian noise.""" + h_rand = measures.hurst_rs(self.no_correlation) # expected h is around 0.5 self.assertAlmostEqual(h_rand, 0.5, delta=0.1) - # cumulative sum has strong positive correlation between - # elements - walk = np.cumsum(x) - h_walk = measures.hurst_rs(walk) - # print("h_walk = %.3f" % h_walk) + def test_hurst_positive_correlation(self) -> None: + """Hypothesis: H > 0.5 for data with positive correlation between successive elements.""" + h_walk = measures.hurst_rs(self.positive_correlation) # expected h is around 1.0 - assert h_walk > 0.9 + self.assertGreater(h_walk, 0.9) - def test_hurst_pracma(self) -> None: - """Tests for hurst_rs using the same tests as in the R-package pracma.""" - np.random.seed(3) + def test_hurst_pracma_bearcave(self) -> None: + """Hypothesis: `hurst_rs` passes test from R-package pracma using brown72 dataset.""" # This test reproduces the results presented by Ian L. 
Kaplan on - # bearcave.com + # http://bearcave.com/misl/misl_tech/wavelets/hurst/index.html h72 = measures.hurst_rs( datasets.brown72, fit="poly", @@ -388,21 +410,16 @@ def test_hurst_pracma(self) -> None: unbiased=False, nvals=2 ** np.arange(3, 11), ) - # print("h72 = %.3f" % h72) self.assertAlmostEqual(h72, 0.72, delta=0.01) - xgn = np.random.normal(size=10000) - hgn = measures.hurst_rs(xgn, fit="poly") - # print("hgn = %.3f" % hgn) - self.assertAlmostEqual(hgn, 0.5, delta=0.1) - + def test_hurst_pracma_logistic(self) -> None: + """Hypothesis: `hurst_rs` passes test from R-package pracma using logistic map.""" xlm = np.fromiter(datasets.logistic_map(0.1, 1024), dtype=np.float64) hlm = measures.hurst_rs(xlm, fit="poly", nvals=2 ** np.arange(3, 11)) - # print("hlm = %.3f" % hlm) self.assertAlmostEqual(hlm, 0.43, delta=0.05) def test_hurst_lorenz(self) -> None: - """Test hypothesis: We get correct values for estimating the hurst exponent of the Lorenz system. + """Hypothesis: We get correct values for estimating the hurst exponent of the Lorenz system. All parameter values are chosen to replicate the experiment by Suyal et al. (see [l_3]_) as closely as possible. 
@@ -416,52 +433,65 @@ def test_hurst_lorenz(self) -> None: """ data = datasets.lorenz_euler(3000, 10, 28, 8 / 3.0, start=[1, 1, 1], dt=0.01)[1000:] hurst_rs_args = {"fit": "poly", "nvals": measures.logarithmic_n(10, 70, 1.1)} - hx = measures.hurst_rs(data[:, 0], **hurst_rs_args) - hy = measures.hurst_rs(data[:, 1], **hurst_rs_args) - hz = measures.hurst_rs(data[:, 2], **hurst_rs_args) - self.assertAlmostEqual(0.9, hx, delta=0.05) - self.assertAlmostEqual(0.9, hy, delta=0.05) - self.assertAlmostEqual(0.9, hz, delta=0.05) + with self.subTest(axis="x"): + hx = measures.hurst_rs(data[:, 0], **hurst_rs_args) + self.assertAlmostEqual(0.9, hx, delta=0.05) + with self.subTest(axis="y"): + hy = measures.hurst_rs(data[:, 1], **hurst_rs_args) + self.assertAlmostEqual(0.9, hy, delta=0.05) + with self.subTest(axis="z"): + hz = measures.hurst_rs(data[:, 2], **hurst_rs_args) + self.assertAlmostEqual(0.9, hz, delta=0.05) class TestNoldsDFA(unittest.TestCase): """Tests for dfa.""" - def test_dfa_base(self) -> None: - np.random.seed(4) + @classmethod + def setUpClass(cls) -> None: + """Create data required for test methods.""" + rng = np.random.default_rng(seed=4) # strong negative correlation between successive elements - seq_neg = [] - x = np.random.random() + cls.negative_correlation = [] + x = rng.random() for _ in range(10000): - x = -x + np.random.random() - 0.5 - seq_neg.append(x) - h_neg = measures.dfa(seq_neg) + x = -x + rng.random() - 0.5 + cls.negative_correlation.append(x) + # no correlation, just gaussian noise + cls.no_correlation = rng.standard_normal(10000) + # cumulative sum has strong positive correlation between + # elements + cls.positive_correlation = np.cumsum(cls.no_correlation) + + def test_dfa_negative_correlation(self) -> None: + """Hypothesis: H < 0.5 for data with negative correlation between successive elements.""" + h_neg = measures.dfa(self.negative_correlation) # expected h is around 0 - assert h_neg < 0.3 + self.assertLess(h_neg, 0.3) - # no 
correlation, just random noise - x = np.random.randn(10000) - h_rand = measures.dfa(x) - # expected h is around 0.5 - assert h_rand < 0.7 - assert h_rand > 0.3 + def test_dfa_no_correlation(self) -> None: + """Hypothesis: H ~= 0.5 for gaussian noise.""" + h_rand = measures.dfa(self.no_correlation) + self.assertAlmostEqual(h_rand, 0.5, delta=0.2) - # cumulative sum has strong positive correlation between - # elements - walk = np.cumsum(x) - h_walk = measures.dfa(walk) + def test_dfa_positive_correlation(self) -> None: + """Hypothesis: H > 0.5 for data with positive correlation between successive elements.""" + h_walk = measures.dfa(self.positive_correlation) # expected h is around 1.0 + self.assertGreater(h_walk, 0.7) assert h_walk > 0.7 def test_dfa_fbm(self) -> None: + """Hypothesis: H ~= h + 1 for fractional brownian motion with Hurst parameter h.""" hs = [0.3, 0.5, 0.7] for h in hs: - data = datasets.fbm(1000, H=h) - he = measures.dfa(data) - self.assertAlmostEqual(he, h + 1, delta=0.15) + with self.subTest(h=h): + data = datasets.fbm(1000, H=h) + he = measures.dfa(data) + self.assertAlmostEqual(he, h + 1, delta=0.15) def test_dfa_lorenz(self) -> None: - """Test hypothesis: We get correct values for estimating the Hurst parameter of the Lorenz system. + """Hypothesis: We get correct values for the Lorenz system. All parameter values are chosen to replicate the experiment by Wallot et al. (see [l_5]_) as closely as possible. 
@@ -478,34 +508,41 @@ def test_dfa_lorenz(self) -> None: data = datasets.lorenz_euler(120000, 10, 28, 8 / 3.0, start=[0.1, 0.1, 0.1], dt=0.002)[ 20000: ] - nvals = measures.logarithmic_n(200, len(data) / 8, 2**0.2) + nvals = measures.logarithmic_n(200, np.ceil(len(data) / 8), 2**0.2) dfa_args = {"nvals": nvals, "order": 2, "overlap": False, "fit_exp": "poly"} - dx = measures.dfa(data[:, 0], **dfa_args) - dy = measures.dfa(data[:, 1], **dfa_args) - dz = measures.dfa(data[:, 2], **dfa_args) - self.assertAlmostEqual(1.008, dx, delta=0.04) - self.assertAlmostEqual(0.926, dy, delta=0.032) - self.assertAlmostEqual(0.650, dz, delta=0.44) + with self.subTest(axis="x"): + dx = measures.dfa(data[:, 0], **dfa_args) + self.assertAlmostEqual(1.008, dx, delta=0.04) + with self.subTest(axis="y"): + dy = measures.dfa(data[:, 1], **dfa_args) + self.assertAlmostEqual(0.926, dy, delta=0.032) + with self.subTest(axis="z"): + dz = measures.dfa(data[:, 2], **dfa_args) + self.assertAlmostEqual(0.650, dz, delta=0.44) def test_dfa_agreement_with_physionet(self) -> None: - """Test hypothesis: Using the same parameters, the output of nolds is identical to the output of PhysioNet.""" + """Hypothesis: The output of nolds is identical to the output of PhysioNet.""" lorenz_x, physionet_points = datasets.load_lorenz_physionet() nvals = [round(x) for x in 10 ** physionet_points[:, 0]] _, (_, nolds_rs, _) = measures.dfa( lorenz_x, nvals=nvals, overlap=False, fit_exp="poly", debug_data=True ) nolds_rs_log10 = nolds_rs / np.log(10) - # assert that sum of squared errors is less than 1e-9 - assert sum((physionet_points[:, 1] - nolds_rs_log10) ** 2) < 1e-09 + with self.subTest(kind="individual"): + assert_almost_equal(nolds_rs_log10, physionet_points[:, 1], decimal=5) + with self.subTest(kind="sse"): + # assert that sum of squared errors is less than 1e-9 + sse = sum((physionet_points[:, 1] - nolds_rs_log10) ** 2) + self.assertLess(sse, 1e-09) @unittest.skipUnless(SCIPY_AVAILABLE, "Tests using Lévy 
motion require scipy.") def test_dfa_levy(self) -> None: - """Test hypothesis: We get correct values for estimating the Hurst parameter of Lévy motion. + """Hypothesis: We get correct values for estimating the Hurst parameter of Lévy motion. Reference: https://github.com/CSchoel/nolds/issues/17#issuecomment-1905472813. """ alpha = 1.5 - x = levy_stable.rvs(alpha=alpha, beta=0, size=10000) + x = cast("np.typing.NDArray[Any]", levy_stable.rvs(alpha=alpha, beta=0, size=10000)) h = measures.dfa(x, fit_exp="poly") self.assertAlmostEqual(0.5, h, delta=0.1) @@ -513,23 +550,32 @@ def test_dfa_levy(self) -> None: class TestNoldsCorrDim(unittest.TestCase): """Tests for corr_dim.""" - def test_corr_dim(self) -> None: - np.random.seed(5) + @classmethod + def setUpClass(cls) -> None: + """Create data required for test methods.""" + rng = np.random.default_rng(seed=5) n = 1000 - data = np.arange(n) - cd = measures.corr_dim(data, 4) + cls.cd1 = np.arange(n) + # TODO: what is the prescribed correlation dimension for random data? + cls.cd0p5 = rng.random(n) + + def test_corr_dim_1(self) -> None: + """Hypothesis: Correlation dimensions is close to 1 for highly correlated dataset.""" + cd = measures.corr_dim(self.cd1, 4) self.assertAlmostEqual(cd, 1, delta=0.05) - # TODO what is the prescribed correlation dimension for random data? - data = np.random.random(n) - cd = measures.corr_dim(data, 4, fit="poly") + + def test_corr_dim_0p5(self) -> None: + """Hypothesis: Correlation dimension is close to 0.5 for dataset without correlations.""" + cd = measures.corr_dim(self.cd0p5, 4, fit="poly") self.assertAlmostEqual(cd, 0.5, delta=0.15) - # TODO test example for cd > 1 + + # TODO: test example for cd > 1 def test_lorenz(self) -> None: - """Test hypothesis: We get correct values for estimating the correlation dimension of the Lorenz system. + """Hypothesis: We get correct values for the Lorenz system. 
- All parameter values are chosen to replicate the experiment by Grassberger and Procaccia (1983) - as closely as possible. + All parameter values are chosen to replicate the experiment by Grassberger and Procaccia + (1983) as closely as possible. For performance reasons the size of the input data was reduced and therefore the assert conditions needed to be relaxed a bit. The settings of n, discard, @@ -544,70 +590,118 @@ def test_lorenz(self) -> None: n = 5000 lag = 10 emb_dim = 5 - data = datasets.lorenz_euler(n + discard, 10, 28, 8 / 3, start=(1, 1, 1), dt=0.012) + data = datasets.lorenz_euler(n + discard, 10, 28, 8 / 3, start=[1, 1, 1], dt=0.012) x = data[discard:, 1] rvals = measures.logarithmic_r(1, np.e, 1.1) # determined experimentally cd = measures.corr_dim(x, emb_dim, fit="poly", rvals=rvals, lag=lag) self.assertAlmostEqual(cd, 2.05, delta=0.2) def test_logistic(self) -> None: - # TODO replicate tests with logistic map from grassberger-procaccia - pass + """Hypothesis: We get correct values for the logistic map.""" + # TODO: replicate tests with logistic map from grassberger-procaccia class TestNoldsSampEn(unittest.TestCase): """Tests for sampen.""" - def test_sampen_base(self) -> None: + def test_sampen_2(self) -> None: + """Hypothesis: `sampen` gives expected results for toy dataset with `emb_dim=2`.""" data = [0, 1, 5, 4, 1, 0, 1, 5, 3] # matches for m=2: 01-01, 15-15 # matches for m=3: 015-015 se = measures.sampen(data) self.assertAlmostEqual(se, -np.log(1.0 / 2), delta=0.01) + + def test_sampen_1(self) -> None: + """Hypothesis: `sampen` gives expected results for toy dataset with `emb_dim=1`.""" data = [1, 2, 1, 2.4, 1, 4] # matches for m=1: 1-1,1-1,2-2.4,1-1 # matches for m=2: [1,2]-[1,2.4], [2,1]-[2.4,1] se = measures.sampen(data, emb_dim=1, tolerance=0.5) self.assertAlmostEqual(se, -np.log(2.0 / 4), delta=0.01) - data = [0, 20, 1, 2, 3, 4, 40, 60, 1.4, 2.4, 3.4, 80, 100, 1.4, 2.4, 3.4, 4, 120, 140, 180] + + def test_sampen_3(self) -> None: + 
"""Hypothesis: `sampen` gives expected results for toy dataset with `emb_dim=3`.""" + data = [ + 0, + 20, + 1, + 2, + 3, + 4, + 40, + 60, + 1.4, + 2.4, + 3.4, + 80, + 100, + 1.4, + 2.4, + 3.4, + 4, + 120, + 140, + 180, + ] # maches for m=3: [1,2,3]-[1.4,2.4,3.4],[1,2,3]-[1.4,2.4,3.4], - # [2,3,4]-[2.4,3.4,4], [1.4,2.4,3.4]-[1.4,2.4,3.4] + # [2,3,4]-[2.4,3.4,4], [1.4,2.4,3.4]-[1.4,2.4,3.4] # noqa: ERA001 # matches for m=4: [1,2,3,4]-[1.4,2.4,3.4,4] se = measures.sampen(data, emb_dim=3, tolerance=0.5) self.assertAlmostEqual(se, -np.log(1.0 / 4), delta=0.01) - def test_sampen_logistic(self) -> None: + def test_sampen_logistic_static(self) -> None: + """Hypothesis: `sampen` gives correct outputs for logistic map with static value.""" # logistic map with r = 2.8 => static value data = list(datasets.logistic_map(0.45, 1000, r=2.8)) - self.assertAlmostEqual(0, measures.sampen(data), delta=0.001) - self.assertAlmostEqual(0, measures.sampen(data[100:], emb_dim=5), delta=0.001) + with self.subTest(emb_dim=2): + self.assertAlmostEqual(0, measures.sampen(data), delta=0.001) + with self.subTest(emb_dim=5): + self.assertAlmostEqual(0, measures.sampen(data[100:], emb_dim=5), delta=0.001) + + def test_sampen_logistic_oscillation_2(self) -> None: + """Hypothesis: `sampen` is correct for logistic map oscillating between 2 values.""" # logistic map with r = 3.3 => oscillation between two values data = list(datasets.logistic_map(0.45, 1000, r=3.3)) - self.assertAlmostEqual(0, measures.sampen(data), delta=0.001) - self.assertAlmostEqual(0, measures.sampen(data[100:], emb_dim=5), delta=0.001) + with self.subTest(emb_dim=2): + self.assertAlmostEqual(0, measures.sampen(data), delta=0.001) + with self.subTest(emb_dim=5): + self.assertAlmostEqual(0, measures.sampen(data[100:], emb_dim=5), delta=0.001) + + def test_sampen_logistic_oscillation_4(self) -> None: + """Hypothesis: `sampen` is correct for logistic map oscillating between 4 values.""" # logistic map with r = 3.5 => oscillation 
between four values data = list(datasets.logistic_map(0.45, 1000, r=3.5)) - self.assertAlmostEqual(0, measures.sampen(data), delta=0.001) - self.assertAlmostEqual(0, measures.sampen(data[100:], emb_dim=5), delta=0.001) + with self.subTest(emb_dim=2): + self.assertAlmostEqual(0, measures.sampen(data), delta=0.001) + with self.subTest(emb_dim=5): + self.assertAlmostEqual(0, measures.sampen(data[100:], emb_dim=5), delta=0.001) + + def test_sampen_logistic_chaotic(self) -> None: + """Hypothesis: `sampen` is correct for logistic map with chaotic behavior.""" # logistic map with r = 3.9 => chaotic behavior data = list(datasets.logistic_map(0.45, 1000, r=3.9)) - self.assertAlmostEqual(0.5, measures.sampen(data[100:]), delta=0.1) - self.assertAlmostEqual(0.5, measures.sampen(data[100:], emb_dim=5), delta=0.1) - - def test_sampen_random(self) -> None: - np.random.seed(6) + with self.subTest(emb_dim=2): + self.assertAlmostEqual(0.5, measures.sampen(data[100:]), delta=0.1) + with self.subTest(emb_dim=5): + self.assertAlmostEqual(0.5, measures.sampen(data[100:], emb_dim=5), delta=0.1) + + def test_sampen_gaussian(self) -> None: + """Hypothesis: `sampen` is correct for gaussian noise.""" + rng = np.random.default_rng(seed=6) # normally distributed random numbers - data = np.random.randn(10000) - self.assertAlmostEqual(2.2, measures.sampen(data), delta=0.1) - self.assertAlmostEqual(2.2, measures.sampen(data, emb_dim=2), delta=0.1) - # TODO add tests with uniformly distributed random numbers + data = rng.standard_normal(10000) + with self.subTest(emb_dim=2): + self.assertAlmostEqual(2.2, measures.sampen(data), delta=0.1) + with self.subTest(emb_dim=5): + self.assertAlmostEqual(2, measures.sampen(data[100:], emb_dim=5), delta=0.1) def test_sampen_sinus(self) -> None: - # TODO add test with sinus signal - pass + """Hypothesis: `sampen` is correct for a sinus signal.""" + # TODO: add test with sinus signal def test_sampen_lorenz(self) -> None: - """Test hypothesis: We get correct 
values for estimating the sample entropy of the Lorenz system. + """Hypothesis: We get correct values for estimating the sample entropy of the Lorenz system. All parameter values are chosen to replicate the experiment by Kaffashi et al. (2008) as closely as possible. @@ -622,12 +716,15 @@ def test_sampen_lorenz(self) -> None: """ data = datasets.lorenz_euler(3000, 10, 28, 8 / 3.0, start=[1, 1, 1], dt=0.01)[1000:] sampen_args = {"emb_dim": 2, "lag": 1} - sx = measures.sampen(data[:, 0], **sampen_args) - sy = measures.sampen(data[:, 1], **sampen_args) - sz = measures.sampen(data[:, 2], **sampen_args) - self.assertAlmostEqual(0.15, sx, delta=0.05) - self.assertAlmostEqual(0.15, sy, delta=0.05) - self.assertAlmostEqual(0.25, sz, delta=0.05) + with self.subTest(axis="x"): + sx = measures.sampen(data[:, 0], **sampen_args) # pyright: ignore reportCallIssue + self.assertAlmostEqual(0.15, sx, delta=0.05) + with self.subTest(axis="y"): + sy = measures.sampen(data[:, 1], **sampen_args) # pyright: ignore reportCallIssue + self.assertAlmostEqual(0.15, sy, delta=0.05) + with self.subTest(axis="z"): + sz = measures.sampen(data[:, 2], **sampen_args) # pyright: ignore reportCallIssue + self.assertAlmostEqual(0.25, sz, delta=0.05) class RegressionTests(unittest.TestCase): @@ -637,53 +734,86 @@ class RegressionTests(unittest.TestCase): as updates to core dependencies such as numpy or the Python standard library. """ + @classmethod + def setUpClass(cls) -> None: + """Loads random data for tests.""" + cls.random_data = datasets.load_qrandom()[:1000] + def test_sampen(self) -> None: - """Test hypothesis: The exact output of sampen() on random data hasn't changed since the last version.""" - data = datasets.load_qrandom()[:1000] + """Hypothesis: The exact output of sampen remains unchanged. + + The test uses random data as input and compares outputs to the previous version. 
+ """ se = measures.sampen( - data, emb_dim=2, tolerance=None, lag=1, dist=measures.rowwise_chebyshev, closed=False + self.random_data, + emb_dim=2, + tolerance=None, + lag=1, + dist=measures.rowwise_chebyshev, + closed=False, ) self.assertAlmostEqual(2.1876999522832743, se, places=14) def test_corr_dim(self) -> None: - """Test hypothesis: The exact output of corr_dim() with `fit=poly` on random data hasn't changed since the last version.""" - data = datasets.load_qrandom()[:1000] + """Hypothesis: The exact output of corr_dim with `fit=poly` remains unchanged. + + The test uses random data as input and compares outputs to the previous version. + """ cd = measures.corr_dim( - data, emb_dim=5, lag=1, rvals=None, dist=measures.rowwise_euclidean, fit="poly" + self.random_data, + emb_dim=5, + lag=1, + rvals=None, + dist=measures.rowwise_euclidean, + fit="poly", ) self.assertAlmostEqual(1.303252839255068, cd, places=14) @unittest.skipUnless(SCIPY_AVAILABLE, "Tests with RANSAC require scipy.") - def test_corr_dim_RANSAC(self) -> None: - """Test hypothesis: The exact output of corr_dim() with `fit=RANSAC` on random data hasn't changed since the last version.""" - data = datasets.load_qrandom()[:1000] - sd = np.std(data, ddof=1) - # fix seed - np.random.seed(42) + def test_corr_dim_RANSAC(self) -> None: # noqa: N802 + """Hypothesis: The exact output of corr_dim with `fit=RANSAC` remains unchanged. + + The test uses random data as input and compares outputs to the previous version. 
+ """ + sd = float(np.std(self.random_data, ddof=1)) # usa a too wide range for rvals to give RANSAC something to do ;) rvals = measures.logarithmic_r(0.01 * sd, 2 * sd, 1.03) cd = measures.corr_dim( - data, emb_dim=5, lag=1, rvals=rvals, dist=measures.rowwise_euclidean, fit="RANSAC" + self.random_data, + emb_dim=5, + lag=1, + rvals=rvals, + dist=measures.rowwise_euclidean, + fit="RANSAC", + random_state=42, ) self.assertAlmostEqual(0.44745494643404665, cd, places=14) def test_lyap_e(self) -> None: - """Test hypothesis: The exact output of lyap_e() on random data hasn't changed since the last version.""" - data = datasets.load_qrandom()[:1000] - le = measures.lyap_e(data, emb_dim=10, matrix_dim=4, min_nb=10, min_tsep=1, tau=1) + """Hypothesis: The exact output of lyap_e remains unchanged. + + The test uses random data as input and compares outputs to the previous version. + """ + le = measures.lyap_e( + self.random_data, emb_dim=10, matrix_dim=4, min_nb=10, min_tsep=1, tau=1 + ) expected = np.array( - [0.03779942603329712, -0.014314012551504982, -0.08436867977030214, -0.22316730257003717] + [ + 0.03779942603329712, + -0.014314012551504982, + -0.08436867977030214, + -0.22316730257003717, + ] ) - for i in range(le.shape[0]): - self.assertAlmostEqual( - expected[i], le[i], places=14, msg=f"{i + 1}th Lyapunov exponent doesn't match" - ) + assert_almost_equal(le, expected, decimal=14) def test_lyap_r(self) -> None: - """Test hypothesis: The exact output of lyap_r() with `fit=poly` on random data hasn't changed since the last version.""" - data = datasets.load_qrandom()[:1000] + """Hypothesis: The exact output of lyap_r with `fit=poly` remains unchanged. + + The test uses random data as input and compares outputs to the previous version. 
+ """ le = measures.lyap_r( - data, + self.random_data, emb_dim=10, lag=1, min_tsep=1, @@ -696,14 +826,15 @@ def test_lyap_r(self) -> None: self.assertAlmostEqual(expected, le, places=14) @unittest.skipUnless(SCIPY_AVAILABLE, "Tests with RANSAC require scipy.") - def test_lyap_r_RANSAC(self) -> None: - """Test hypothesis: The exact output of lyap_r() with `fit=RANSAC` on random data hasn't changed since the last version.""" - data = datasets.load_qrandom()[:1000] - np.random.seed(42) + def test_lyap_r_RANSAC(self) -> None: # noqa: N802 + """Hypothesis: The exact output of lyap_r with `fit=RANSAC` remains unchanged. + + The test uses random data as input and compares outputs to the previous version. + """ # set lag to 2 for weird duplicate lines # set trajectory_len to 100 to get many datapoints for RANSAC to choose from le = measures.lyap_r( - data, + self.random_data, emb_dim=10, lag=2, min_tsep=1, @@ -711,83 +842,131 @@ def test_lyap_r_RANSAC(self) -> None: min_neighbors=10, trajectory_len=100, fit="RANSAC", + random_state=42, ) expected = 0.0003401212353253564 self.assertAlmostEqual(expected, le, places=14) def test_hurst_rs(self) -> None: - """Test hypothesis: The exact output of hurst_rs() with `fit=poly` on random data hasn't changed since the last version.""" - data = datasets.load_qrandom()[:1000] - rs = measures.hurst_rs(data, nvals=None, fit="poly", corrected=True, unbiased=True) + """Hypothesis: The exact output of hurst_rs with `fit=poly` remains unchanged. + + The test uses random data as input and compares outputs to the previous version. 
+ """ + rs = measures.hurst_rs( + self.random_data, nvals=None, fit="poly", corrected=True, unbiased=True + ) expected = 0.5123887964986258 self.assertAlmostEqual(expected, rs, places=14) @unittest.skipUnless(SCIPY_AVAILABLE, "Tests with RANSAC require scipy.") - def test_hurst_rs_RANSAC(self) -> None: - """Test hypothesis: The exact output of hurst_rs() with `fit=RANSAC` on random data hasn't changed since the last version.""" - data = datasets.load_qrandom()[:1000] - np.random.seed(42) + def test_hurst_rs_RANSAC(self) -> None: # noqa: N802 + """Hypothesis: The exact output of hurst_rs with `fit=RANSAC` remains unchanged. + + The test uses random data as input and compares outputs to the previous version. + """ # increase nsteps in nvals to have more data points for RANSAC to choose from - nvals = measures.logmid_n(data.shape[0], ratio=1 / 4.0, nsteps=100) - rs = measures.hurst_rs(data, nvals=nvals, fit="RANSAC", corrected=True, unbiased=True) + nvals = measures.logmid_n(self.random_data.shape[0], ratio=1 / 4.0, nsteps=100) + rs = measures.hurst_rs( + self.random_data, + nvals=nvals, + fit="RANSAC", + corrected=True, + unbiased=True, + random_state=42, + ) expected = 0.4805431939943321 self.assertAlmostEqual(expected, rs, places=14) def test_dfa(self) -> None: - """Test hypothesis: The exact output of dfa() with `fit_exp=poly` on random data hasn't changed since the last version.""" - data = datasets.load_qrandom()[:1000] - h = measures.dfa(data, nvals=None, overlap=True, order=1, fit_trend="poly", fit_exp="poly") + """Hypothesis: The exact output of dfa with `fit_exp=poly` remains unchanged. + + The test uses random data as input and compares outputs to the previous version. 
+ """ + h = measures.dfa( + self.random_data, + nvals=None, + overlap=True, + order=1, + fit_trend="poly", + fit_exp="poly", + ) expected = 0.5450874638765073 self.assertAlmostEqual(expected, h, places=14) @unittest.skipUnless(SCIPY_AVAILABLE, "Tests with RANSAC require scipy.") - def test_dfa_RANSAC(self) -> None: - """Test hypothesis: The exact output of dfa() with `fit_exp=RANSAC` on random data hasn't changed since the last version.""" + def test_dfa_RANSAC(self) -> None: # noqa: N802 + """Hypothesis: The exact output of dfa with `fit_exp=RANSAC` remains unchanged. + + The test uses random data as input and compares outputs to the previous version. + """ # adds trend to data to introduce a less clear line for fitting - data = datasets.load_qrandom()[:1000] + np.arange(1000) * 100 - np.random.seed(42) - # adds more steps and higher values to nvals to introduce some scattering for RANSAC to have an effect on - nvals = measures.logarithmic_n(10, 0.9 * data.shape[0], 1.1) + random_data = self.random_data + np.arange(1000) * 100 + # adds more steps and higher values to nvals to introduce some scattering + # for RANSAC to have an effect on + nvals = measures.logarithmic_n(10, 0.9 * random_data.shape[0], 1.1) h = measures.dfa( - data, nvals=nvals, overlap=True, order=1, fit_trend="poly", fit_exp="RANSAC" + random_data, + nvals=nvals, + overlap=True, + order=1, + fit_trend="poly", + fit_exp="RANSAC", + random_state=42, ) expected = 1.1372303125405405 self.assertAlmostEqual(expected, h, places=14) def test_mfhurst_b(self) -> None: - """Test hypothesis: The exact output of mfhurst_b() with `fit=poly` on random data hasn't changed since the last version.""" - data = datasets.load_qrandom()[:1000] - h = measures.mfhurst_b(data, qvals=[1], dists=None, fit="poly") + """Hypothesis: The exact output of mfhurst_b with `fit=poly` remains unchanged. + + The test uses random data as input and compares outputs to the previous version. 
+ """ + h = measures.mfhurst_b(self.random_data, qvals=[1], dists=None, fit="poly") expected = [-0.00559398934417339] self.assertAlmostEqual(expected[0], h[0], places=14) @unittest.skipUnless(SCIPY_AVAILABLE, "Tests with RANSAC require scipy.") - def test_mfhurst_b_RANSAC(self) -> None: - """Test hypothesis: The exact output of mfhurst_b() with `fit=RANSAC` on random data hasn't changed since the last version.""" - data = datasets.load_qrandom()[:1000] - np.random.seed(42) - h = measures.mfhurst_b(data, qvals=[1], dists=None, fit="RANSAC") + def test_mfhurst_b_RANSAC(self) -> None: # noqa: N802 + """Hypothesis: The exact output of mfhurst_b with `fit=RANSAC` remains unchanged. + + The test uses random data as input and compares outputs to the previous version. + """ + h = measures.mfhurst_b( + self.random_data, qvals=[1], dists=None, fit="RANSAC", random_state=42 + ) expected = [-0.009056463064211057] self.assertAlmostEqual(expected[0], h[0], places=14) def test_mfhurst_dm(self) -> None: - """Test hypothesis: The exact output of mfhurst_dm() with `fit=poly` on random data hasn't changed since the last version.""" - data = datasets.load_qrandom()[:1000] + """Hypothesis: The exact output of mfhurst_dm with `fit=poly` remains unchanged. + + The test uses random data as input and compares outputs to the previous version. 
+ """ h, _ = measures.mfhurst_dm( - data, qvals=[1], max_dists=range(5, 20), detrend=True, fit="poly" + self.random_data, + qvals=[1], + max_dists=range(5, 20), + detrend=True, + fit="poly", ) expected = [0.008762803881203145] self.assertAlmostEqual(expected[0], h[0], places=14) @unittest.skipUnless(SCIPY_AVAILABLE, "Tests with RANSAC require scipy.") - def test_mfhurst_dm_RANSAC(self) -> None: - """Test hypothesis: The exact output of mfhurst_dm() with `fit=RANSAC` on random data hasn't changed since the last version.""" - data = datasets.load_qrandom()[:1000] - np.random.seed(42) + def test_mfhurst_dm_RANSAC(self) -> None: # noqa: N802 + """Hypothesis: The exact output of mfhurst_dm with `fit=RANSAC` remains unchanged. + + The test uses random data as input and compares outputs to the previous version. + """ h, _ = measures.mfhurst_dm( - data, qvals=[1], max_dists=range(5, 20), detrend=True, fit="RANSAC" + self.random_data, + qvals=[1], + max_dists=range(5, 20), + detrend=True, + fit="RANSAC", + random_state=42, ) - expected = [0.005324834328837356] + expected = [0.0068840609945006685] self.assertAlmostEqual(expected[0], h[0], places=14) @@ -795,7 +974,7 @@ class PreviousDefectTests(unittest.TestCase): """Tests that ensure that a previous bug doesn't come back at some point.""" def test_lyap_r_complex_min_tsep(self) -> None: - """Test hypothesis: The `min_tsep` parameter can be calculated without creating complex numbers. + """Hypothesis: The `min_tsep` parameter can be calculated without creating complex numbers. Previously, this would lead to an exception in the code. See https://github.com/CSchoel/nolds/issues/53 for reference. 
From af05b5a494e165de3ba05ee65b8dfd3fdb169c80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christopher=20Sch=C3=B6lzel?= Date: Mon, 25 Aug 2025 19:25:24 +0200 Subject: [PATCH 25/36] allows setting random_state for RANSAC --- nolds/measures.py | 84 ++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 72 insertions(+), 12 deletions(-) diff --git a/nolds/measures.py b/nolds/measures.py index b29a205..1690a92 100644 --- a/nolds/measures.py +++ b/nolds/measures.py @@ -56,6 +56,7 @@ def poly_fit( y: NumberArray1D, degree: int, fit: FittingMethod = "RANSAC", + random_state: int | None = None, ) -> FloatArray1D: """Fits a polynomial of the given degree to the data. @@ -73,6 +74,7 @@ def poly_fit( y: y-axis values degree: degree of the polynomial fit: algorithm to use for fitting + random_state: Seed for random number generator used for RANSAC """ # check if we can use RANSAC if fit == "RANSAC": @@ -93,7 +95,9 @@ def poly_fit( if fit == "poly": return np.polyfit(x, y, degree) if fit == "RANSAC": - model = sklin.RANSACRegressor(sklin.LinearRegression(fit_intercept=False)) + model = sklin.RANSACRegressor( + sklin.LinearRegression(fit_intercept=False), random_state=random_state + ) xdat = np.asarray(x) if len(xdat.shape) == 1: # interpret 1d-array as list of len(x) samples instead of @@ -186,6 +190,7 @@ def lyap_r( debug_data: Literal[False] = False, plot_file: str | Path | None = None, fit_offset: int = 0, + random_state: int | None = None, ) -> np.float64: ... 
@@ -204,6 +209,7 @@ def lyap_r( debug_data: Literal[True] = True, plot_file: str | Path | None = None, fit_offset: int = 0, + random_state: int | None = None, ) -> tuple[ np.float64, tuple[ @@ -228,6 +234,7 @@ def lyap_r( # noqa: C901, PLR0912, PLR0915 debug_data: bool = False, plot_file: str | Path | None = None, fit_offset: int = 0, + random_state: int | None = None, ) -> ( np.float64 | tuple[ @@ -333,6 +340,8 @@ def lyap_r( # noqa: C901, PLR0912, PLR0915 under the given file name instead of directly showing it through ``plt.show()`` fit_offset: neglect the first fit_offset steps when fitting + random_state: Seed for random number generator used for RANSAC + Returns: An estimate of the largest Lyapunov exponent (a positive exponent is @@ -478,7 +487,13 @@ def nb_neighbors(lag_value: int) -> int: poly = np.array([-np.inf, 0], dtype=np.float64) else: # normal line fitting - poly = poly_fit(ks[fit_offset:], div_traj[fit_offset:], 1, fit=fit) + poly = poly_fit( + ks[fit_offset:], + div_traj[fit_offset:], + 1, + fit=fit, + random_state=random_state, + ) if debug_plot: plot_reg( ks[fit_offset:].astype(np.float64), @@ -1231,7 +1246,11 @@ def expected_rs(n: np.integer) -> float: return front * middle * back -def expected_h(nvals: IntArrayLike1D, fit: FittingMethod = "RANSAC") -> float: +def expected_h( + nvals: IntArrayLike1D, + fit: FittingMethod = "RANSAC", + random_state: int | None = None, +) -> float: """Uses expected_rs to calculate the expected value for the Hurst exponent h. 
Args: @@ -1239,13 +1258,17 @@ def expected_h(nvals: IntArrayLike1D, fit: FittingMethod = "RANSAC") -> float: fit: the fitting method to use for the line fit, either 'poly' for normal least squares polynomial fitting or 'RANSAC' for RANSAC-fitting which is more robust to outliers + random_state: Seed for random number generator used for RANSAC + Returns: expected h for white noise """ nvals = np.asarray(nvals, dtype=np.int32) rsvals = [expected_rs(n) for n in nvals] - poly = poly_fit(np.log(nvals), np.log(rsvals), 1, fit=fit) + poly = poly_fit( + np.log(nvals), np.log(rsvals), 1, fit=fit, random_state=random_state + ) return poly[0] @@ -1510,6 +1533,7 @@ def hurst_rs( plot_file: str | Path | None = None, corrected: bool = True, unbiased: bool = True, + random_state: int | None = None, ) -> float: ... @@ -1524,6 +1548,7 @@ def hurst_rs( plot_file: str | Path | None = None, corrected: bool = True, unbiased: bool = True, + random_state: int | None = None, ) -> tuple[ float, tuple[ @@ -1544,6 +1569,7 @@ def hurst_rs( plot_file: str | Path | None = None, corrected: bool = True, unbiased: bool = True, + random_state: int | None = None, ) -> ( float | tuple[ @@ -1683,6 +1709,7 @@ def hurst_rs( (1/(N-1) instead of 1/N) will be used. This should be the default choice, since the true mean of the sequences is not known. This parameter should only be changed to recreate results of other implementations. 
+ random_state: Seed for random number generator used for RANSAC Returns: Estimated Hurst exponent K using a rescaled range approach (if K = 0.5 @@ -1729,7 +1756,7 @@ def hurst_rs( yvals = np.log(rsvals) if corrected: yvals -= np.log([expected_rs(n) for n in nvals]) - poly = poly_fit(xvals, yvals, 1, fit=fit) + poly = poly_fit(xvals, yvals, 1, fit=fit, random_state=random_state) if debug_plot: plot_reg(xvals, yvals, poly, "log(n)", "log((R/S)_n)", fname=plot_file) # account for correction if necessary @@ -1750,6 +1777,7 @@ def mfhurst_b( debug_plot: bool = False, debug_data: Literal[False] = False, plot_file: str | Path | None = None, + random_state: int | None = None, ) -> FloatArray1D: ... @@ -1763,6 +1791,7 @@ def mfhurst_b( debug_plot: bool = False, debug_data: Literal[True] = True, plot_file: str | Path | None = None, + random_state: int | None = None, ) -> tuple[ FloatArray1D, tuple[ @@ -1782,6 +1811,7 @@ def mfhurst_b( debug_plot: bool = False, debug_data: bool = False, plot_file: str | Path | None = None, + random_state: int | None = None, ) -> ( FloatArray1D | tuple[ @@ -1882,6 +1912,7 @@ def mfhurst_b( plot_file: if debug_plot is True and plot_file is not None, the plot will be saved under the given file name instead of directly showing it through ``plt.show()`` + random_state: Seed for random number generator used for RANSAC Returns: list of H_q for every q given in ``qvals``. 
If ``debug_data`` is True, @@ -1929,7 +1960,10 @@ def hhcorr(d: np.integer, q: np.floating) -> np.floating: xvals = np.log(dists) yvals = np.log(corrvals) polys = np.array( - [poly_fit(xvals, yvals[:, qi], 1, fit=fit) for qi in range(len(qvals))], + [ + poly_fit(xvals, yvals[:, qi], 1, fit=fit, random_state=random_state) + for qi in range(len(qvals)) + ], dtype=np.float64, ) H = np.array(polys)[:, 0] / qvals @@ -2096,6 +2130,7 @@ def mfhurst_dm( debug_plot: bool = False, debug_data: Literal[False] = False, plot_file: str | Path | None = None, + random_state: int | None = None, ) -> tuple[ FloatArray1D, FloatArray1D, @@ -2113,6 +2148,7 @@ def mfhurst_dm( debug_plot: bool = False, debug_data: Literal[True] = True, plot_file: str | Path | None = None, + random_state: int | None = None, ) -> tuple[ FloatArray1D, FloatArray1D, @@ -2134,6 +2170,7 @@ def mfhurst_dm( debug_plot: bool = False, debug_data: bool = False, plot_file: str | Path | None = None, + random_state: int | None = None, ) -> ( tuple[ FloatArray1D, @@ -2211,6 +2248,7 @@ def mfhurst_dm( plot_file: if debug_plot is True and plot_file is not None, the plot will be saved under the given file name instead of directly showing it through ``plt.show()`` + random_state: Seed for random number generator used for RANSAC Returns: tuple containing @@ -2261,7 +2299,7 @@ def mfhurst_dm( step_size = dist stepdata = data[::step_size] if detrend: - stepdata = detrend_data(stepdata, order=1) + stepdata = detrend_data(stepdata, order=1, random_state=random_state) diffs = stepdata[1:] - stepdata[:-1] hhcorr.append( [ @@ -2277,7 +2315,9 @@ def mfhurst_dm( # ranges and does not introduce any new information. 
H = np.array( [ - poly_fit(xvals[:md], yvals[:md, qi], 1, fit=fit)[0] + poly_fit(xvals[:md], yvals[:md, qi], 1, fit=fit, random_state=random_state)[ + 0 + ] for qi in range(len(qvals)) for md in max_dists ], @@ -2325,6 +2365,7 @@ def corr_dim( debug_plot: bool = False, debug_data: Literal[False] = False, plot_file: str | Path | None = None, + random_state: int | None = None, ) -> float: ... @@ -2346,6 +2387,7 @@ def corr_dim( debug_plot: bool = False, debug_data: Literal[True] = True, plot_file: str | Path | None = None, + random_state: int | None = None, ) -> tuple[ float, tuple[ @@ -2373,6 +2415,7 @@ def corr_dim( debug_plot: bool = False, debug_data: bool = False, plot_file: str | Path | None = None, + random_state: int | None = None, ) -> ( float | tuple[ @@ -2453,6 +2496,7 @@ def corr_dim( plot_file: if debug_plot is True and plot_file is not None, the plot will be saved under the given file name instead of directly showing it through ``plt.show()`` + random_state: Seed for random number generator used for RANSAC Returns: correlation dimension as slope of the line fitted to log(r) vs log(C(r)) @@ -2505,7 +2549,9 @@ def corr_dim( # all sums are zero => we cannot fit a line poly = np.array([np.nan, np.nan], dtype=np.float64) else: - poly = poly_fit(np.log(rvals), np.log(csums), 1, fit=fit) + poly = poly_fit( + np.log(rvals), np.log(csums), 1, fit=fit, random_state=random_state + ) if debug_plot: plot_reg( np.log(rvals), np.log(csums), poly, "log(r)", "log(C(r))", fname=plot_file @@ -2519,11 +2565,12 @@ def detrend_data( data: FloatArray1D, order: int = 1, fit: FittingMethod = "poly", + random_state: int | None = None, ) -> FloatArray1D: """Removes a trend of given order from the data.""" # TODO: also use this function in dfa xvals = np.arange(len(data)) - trend = poly_fit(xvals, data, order, fit=fit) + trend = poly_fit(xvals, data, order, fit=fit, random_state=random_state) return data - np.polyval(trend, xvals) @@ -2539,6 +2586,7 @@ def dfa( debug_plot: bool = 
False, debug_data: Literal[False] = False, plot_file: str | Path | None = None, + random_state: int | None = None, ) -> float: ... @@ -2554,6 +2602,7 @@ def dfa( debug_plot: bool = False, debug_data: Literal[True] = True, plot_file: str | Path | None = None, + random_state: int | None = None, ) -> tuple[ float, tuple[ @@ -2575,6 +2624,7 @@ def dfa( # noqa: C901, PLR0912, PLR0915 debug_plot: bool = False, debug_data: bool = False, plot_file: str | Path | None = None, + random_state: int | None = None, ) -> ( float | tuple[ @@ -2723,6 +2773,7 @@ def dfa( # noqa: C901, PLR0912, PLR0915 plot_file: if debug_plot is True and plot_file is not None, the plot will be saved under the given file name instead of directly showing it through ``plt.show()`` + random_state: Seed for random number generator used for RANSAC Returns: the estimate alpha for the Hurst parameter (alpha < 1: stationary process similar to fractional Gaussian noise with H = alpha, @@ -2777,7 +2828,10 @@ def dfa( # noqa: C901, PLR0912, PLR0915 d = d.reshape((total_N // n, n)) # calculate local trends as polynomes x = np.arange(n) - tpoly = [poly_fit(x, d[i], order, fit=fit_trend) for i in range(len(d))] + tpoly = [ + poly_fit(x, d[i], order, fit=fit_trend, random_state=random_state) + for i in range(len(d)) + ] tpoly = np.array(tpoly) trend = np.array([np.polyval(tpoly[i], x) for i in range(len(d))]) # calculate mean-square differences for each walk in d around trend @@ -2797,7 +2851,13 @@ def dfa( # noqa: C901, PLR0912, PLR0915 # all fluctuations are zero => we cannot fit a line poly = np.array([np.nan, np.nan], dtype=np.float64) else: - poly = poly_fit(np.log(nvals), np.log(fluctuations), 1, fit=fit_exp) + poly = poly_fit( + np.log(nvals), + np.log(fluctuations), + 1, + fit=fit_exp, + random_state=random_state, + ) if debug_plot: plot_reg( np.log(nvals), From c445ffdd76a7bc202ae3b20a4f23075c2399dd52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christopher=20Sch=C3=B6lzel?= Date: Mon, 25 Aug 2025 19:25:40 
+0200 Subject: [PATCH 26/36] updates ignore rules for tests --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 9463d8f..f55f634 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -95,7 +95,7 @@ convention = "google" [tool.ruff.lint.per-file-ignores] # source: https://github.com/astral-sh/ruff/issues/4368#issue-1705468153 -"test/**/*.py" = [ +"/**/test_*.py" = [ # at least this three should be fine in tests: "S101", # asserts allowed in tests... "ARG", # Unused function args -> fixtures nevertheless are functionally relevant... @@ -104,6 +104,7 @@ convention = "google" "PLR2004", # Magic value used in comparison, ... "S311", # Standard pseudo-random generators are not suitable for cryptographic purposes "INP001", # Test folders should not have an `__init__.py` + "PT", # We're not using pytest, so we want to use the unittest assert methods ] [[tool.uv.index]] From 1888d5d5d1d64c344e88e9cf74c28f74b92b14ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christopher=20Sch=C3=B6lzel?= Date: Fri, 29 Aug 2025 21:19:18 +0200 Subject: [PATCH 27/36] ensures that we only discover test files under the main nolds module folder --- .vscode/settings.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 834621f..ee6b32c 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -7,7 +7,7 @@ "python.testing.unittestArgs": [ "-v", "-s", - "./nolds", + ".", "-p", "test_*.py" ], From 2e0ac2b19394e000d4fbb3e69f2a531a3f921b26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christopher=20Sch=C3=B6lzel?= Date: Sun, 31 Aug 2025 17:07:19 +0200 Subject: [PATCH 28/36] ensure that uv uses correct Python version --- .github/workflows/ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 1aa9271..76941de 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -24,7 
+24,7 @@ jobs: python-version: ${{ matrix.python }} - name: Install uv uses: astral-sh/setup-uv@v5 - - run: uv sync ${{ matrix.extras }} + - run: uv sync ${{ matrix.extras }} --python ${{ matrix.python }} - run: uv pip install codecov - run: uv run coverage run -m unittest nolds.test_measures - run: uv run codecov From 4b3b6805c2680323ee706eabca7bcf077bb5bcaf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christopher=20Sch=C3=B6lzel?= Date: Sun, 31 Aug 2025 17:10:21 +0200 Subject: [PATCH 29/36] makes CI consistent with pyproject --- .github/workflows/ci.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 76941de..cbbe3fd 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -9,10 +9,10 @@ jobs: build: strategy: matrix: - python: ["3.7", "3.10"] + python: ["3.8", "3.10"] extras: ["", "--all-extras"] include: - - python: "3.7" + - python: "3.8" os: ubuntu-22.04 - python: "3.10" os: ubuntu-latest From 174888e1f87fc90f5ee7f33b7fb72e5667ed2826 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christopher=20Sch=C3=B6lzel?= Date: Sun, 31 Aug 2025 17:32:23 +0200 Subject: [PATCH 30/36] adjusts supported Python range to 3.10-3.13 --- .github/workflows/ci.yaml | 11 +++-------- pyproject.toml | 7 +++---- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index cbbe3fd..76153e0 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -9,14 +9,9 @@ jobs: build: strategy: matrix: - python: ["3.8", "3.10"] + python: ["3.10", "3.13"] extras: ["", "--all-extras"] - include: - - python: "3.8" - os: ubuntu-22.04 - - python: "3.10" - os: ubuntu-latest - runs-on: ${{ matrix.os }} + runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 @@ -30,4 +25,4 @@ jobs: - run: uv run codecov env: CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} - if: ${{ matrix.python == '3.10' && matrix.extras != '' 
}} + if: ${{ matrix.python == '3.13' && matrix.extras != '' }} diff --git a/pyproject.toml b/pyproject.toml index f55f634..581289a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,14 +27,13 @@ classifiers = [ "License :: OSI Approved :: MIT License", "Topic :: Scientific/Engineering :: Bio-Informatics", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", ] readme = "README.rst" -requires-python = ">=3.8" +requires-python = ">=3.10" dependencies = ["numpy>1.0,<3.0", "future>=1.0", "setuptools>=72.1.0"] [project.optional-dependencies] @@ -64,7 +63,7 @@ line-length = 100 # allow slightly longer lines indent-width = 4 # Assume Python 3.8 -target-version = "py38" +target-version = "py310" [tool.ruff.format] # Like Black, use double quotes for strings. 
From 0037d445feffd39912d123c40546b4b7c291d2a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christopher=20Sch=C3=B6lzel?= Date: Sun, 31 Aug 2025 17:39:10 +0200 Subject: [PATCH 31/36] resolves linting errors that occured from switching target Python version --- nolds/datasets.py | 8 ++++-- nolds/examples.py | 2 +- nolds/measures.py | 55 +++++++++++------------------------------- nolds/test_measures.py | 2 +- 4 files changed, 22 insertions(+), 45 deletions(-) diff --git a/nolds/datasets.py b/nolds/datasets.py index 645ace6..b07a032 100644 --- a/nolds/datasets.py +++ b/nolds/datasets.py @@ -5,10 +5,14 @@ import csv import datetime import importlib.resources -from typing import IO, Any, Generator +import itertools +from typing import IO, TYPE_CHECKING, Any import numpy as np +if TYPE_CHECKING: + from collections.abc import Generator + def lorenz_euler( length: int, @@ -480,7 +484,7 @@ def b1991( d, nxtp = b1991(x1, fractal[x1], x2 - x1, fractal[x2 - 1] - fractal[x1]) fractal[x1:x2] = d next_intervals.extend( - [(np1, np2) for np1, np2 in zip(nxtp[:-1], nxtp[1:])], + [(np1, np2) for np1, np2 in itertools.pairwise(nxtp)], ) intervals = next_intervals return fractal diff --git a/nolds/examples.py b/nolds/examples.py index b66b2d9..80bf29f 100644 --- a/nolds/examples.py +++ b/nolds/examples.py @@ -159,7 +159,7 @@ def plot_lyap(maptype: Literal["logistic", "tent"] = "logistic") -> None: x_0 = 0.5 # avoid zero crossings in f'(x) lambdas = [ np.mean(np.log(abs(r - 2 * r * x[np.where(x != x_0)]))) - for x, r in zip(full_data, param_range) + for x, r in zip(full_data, param_range, strict=True) ] elif maptype == "tent": param_name = "$\\mu$" diff --git a/nolds/measures.py b/nolds/measures.py index 1690a92..598e13c 100644 --- a/nolds/measures.py +++ b/nolds/measures.py @@ -6,7 +6,6 @@ import warnings from typing import ( TYPE_CHECKING, - Callable, Literal, TypeAlias, TypeVar, @@ -17,6 +16,7 @@ import numpy as np if TYPE_CHECKING: + from collections.abc import Callable from pathlib 
import Path from numpy.typing import ArrayLike @@ -119,9 +119,7 @@ def poly_fit( raise ValueError(msg) -def delay_embedding( - data: NumberArrayLike1D, emb_dim: int, lag: int = 1 -) -> FloatArray2D: +def delay_embedding(data: NumberArrayLike1D, emb_dim: int, lag: int = 1) -> FloatArray2D: """Perform a time-delay embedding of a time series. Args: @@ -446,9 +444,7 @@ def nb_neighbors(lag_value: int) -> int: raise ValueError(msg.format(-ntraj + 1)) if ntraj < min_traj: # not enough data points => there are rows where all values are inf - assert np.any(np.all(np.isinf(dists[:ntraj, :ntraj]), axis=1)), ( - "no inf rows found" - ) + assert np.any(np.all(np.isinf(dists[:ntraj, :ntraj]), axis=1)), "no inf rows found" msg = ( "Not enough data points. At least {} trajectories are required " "to find a valid neighbor for each orbit vector with min_tsep={} " @@ -874,9 +870,7 @@ def plot_dists( std = np.std(dists_full, ddof=1) rng = (0.0, float(mean + std * nstd)) colors = ["green", "blue"] - for i, (h, bins) in enumerate( - [np.histogram(dat, bins=nbins, range=rng) for dat in dists] - ): + for i, (h, bins) in enumerate([np.histogram(dat, bins=nbins, range=rng) for dat in dists]): bw = bins[1] - bins[0] plt.bar(bins[:-1], h, bw, label=f"m={m + i:d}", color=colors[i], alpha=0.5) plt.axvline(tolerance, color="red") @@ -1266,9 +1260,7 @@ def expected_h( """ nvals = np.asarray(nvals, dtype=np.int32) rsvals = [expected_rs(n) for n in nvals] - poly = poly_fit( - np.log(nvals), np.log(rsvals), 1, fit=fit, random_state=random_state - ) + poly = poly_fit(np.log(nvals), np.log(rsvals), 1, fit=fit, random_state=random_state) return poly[0] @@ -1321,9 +1313,7 @@ def rs(data: FloatArray1D, n: np.integer, *, unbiased: bool = True) -> float: def plot_histogram_matrix( data: FloatArray2D, name: str, - bin_range: Literal[ - "absmax", "1sigma", "2sigma", "3sigma", "4sigma", "5sigma" - ] = "3sigma", + bin_range: Literal["absmax", "1sigma", "2sigma", "3sigma", "4sigma", "5sigma"] = "3sigma", 
fname: str | Path | None = None, ) -> None: """Plot a quadratic matrix of histograms. @@ -1460,9 +1450,7 @@ def plot_reg_tiled( plt.subplot(int(np.ceil(len(xvals) / columns)), columns, i + 1) plt.plot(xvals[i], yvals[i], "bo", label=data_labels[i]) if polys is not None: - plt.plot( - xvals[i], np.polyval(polys[i], xvals[i]), "r-", label=reg_labels[i] - ) + plt.plot(xvals[i], np.polyval(polys[i], xvals[i]), "r-", label=reg_labels[i]) plt.xlabel(x_label) plt.ylabel(y_label) plt.ylim(means[i] - max_span / 2, means[i] + max_span / 2) @@ -1970,10 +1958,8 @@ def hhcorr(d: np.integer, q: np.floating) -> np.floating: if debug_plot: plot_reg_multiple( np.array([xvals] * len(qvals), dtype=np.float64), - np.array( - [yvals[:, qi] / qvals[qi] for qi in range(len(qvals))], dtype=np.float64 - ), - np.array([p / q for p, q in zip(polys, qvals)], dtype=np.float64), + np.array([yvals[:, qi] / qvals[qi] for qi in range(len(qvals))], dtype=np.float64), + np.array([p / q for p, q in zip(polys, qvals, strict=False)], dtype=np.float64), x_label="log(x)", y_label="$\\log(c_q(x)) / q$", data_labels=[f"q = {q}" for q in qvals], @@ -2301,12 +2287,7 @@ def mfhurst_dm( if detrend: stepdata = detrend_data(stepdata, order=1, random_state=random_state) diffs = stepdata[1:] - stepdata[:-1] - hhcorr.append( - [ - np.mean(np.abs(diffs) ** q) / np.mean(np.abs(stepdata) ** q) - for q in qvals - ] - ) + hhcorr.append([np.mean(np.abs(diffs) ** q) / np.mean(np.abs(stepdata) ** q) for q in qvals]) hhcorr = np.array(hhcorr, dtype=np.float64) xvals = np.log(np.arange(1, max_max_dist + 1)) yvals = np.log(hhcorr) @@ -2315,9 +2296,7 @@ def mfhurst_dm( # ranges and does not introduce any new information. 
H = np.array( [ - poly_fit(xvals[:md], yvals[:md, qi], 1, fit=fit, random_state=random_state)[ - 0 - ] + poly_fit(xvals[:md], yvals[:md, qi], 1, fit=fit, random_state=random_state)[0] for qi in range(len(qvals)) for md in max_dists ], @@ -2330,9 +2309,7 @@ def mfhurst_dm( ) plot_reg_multiple( np.array([xvals] * len(qvals), dtype=np.float64), - np.array( - [yvals[:, qi] / qvals[qi] for qi in range(len(qvals))], dtype=np.float64 - ), + np.array([yvals[:, qi] / qvals[qi] for qi in range(len(qvals))], dtype=np.float64), polys, x_label="log(x)", y_label="$\\log(c_q(x)) / q$", @@ -2549,13 +2526,9 @@ def corr_dim( # all sums are zero => we cannot fit a line poly = np.array([np.nan, np.nan], dtype=np.float64) else: - poly = poly_fit( - np.log(rvals), np.log(csums), 1, fit=fit, random_state=random_state - ) + poly = poly_fit(np.log(rvals), np.log(csums), 1, fit=fit, random_state=random_state) if debug_plot: - plot_reg( - np.log(rvals), np.log(csums), poly, "log(r)", "log(C(r))", fname=plot_file - ) + plot_reg(np.log(rvals), np.log(csums), poly, "log(r)", "log(C(r))", fname=plot_file) if debug_data: return (poly[0], (np.log(rvals), np.log(csums), poly)) return poly[0] diff --git a/nolds/test_measures.py b/nolds/test_measures.py index f64dfa3..771a610 100644 --- a/nolds/test_measures.py +++ b/nolds/test_measures.py @@ -161,7 +161,7 @@ def logistic(x: float, r: float) -> float: """Logistic map.""" return r * x * (1 - x) - for r, s in zip(rvals, sign): + for r, s in zip(rvals, sign, strict=True): log = [] x = x0 for _ in range(100): From 3ee046ab491f3fc73b80d708d05bfe3df5b0159f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christopher=20Sch=C3=B6lzel?= Date: Sun, 31 Aug 2025 18:19:52 +0200 Subject: [PATCH 32/36] removes redundant target version --- pyproject.toml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 581289a..a347800 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,6 +33,7 @@ classifiers = [ "Programming 
Language :: Python :: 3.13", ] readme = "README.rst" +# NOTE: This also informs ruff which Python version to target (no need for explicit target-version) requires-python = ">=3.10" dependencies = ["numpy>1.0,<3.0", "future>=1.0", "setuptools>=72.1.0"] @@ -62,9 +63,6 @@ build-backend = "hatchling.build" line-length = 100 # allow slightly longer lines indent-width = 4 -# Assume Python 3.8 -target-version = "py310" - [tool.ruff.format] # Like Black, use double quotes for strings. quote-style = "double" From 658cf774553f36ee05d7e21ff732a0f239cb0dd2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christopher=20Sch=C3=B6lzel?= Date: Sun, 31 Aug 2025 19:14:27 +0200 Subject: [PATCH 33/36] makes float and int precision configurable and consistent --- nolds/measures.py | 141 ++++++++++++++++++++++++----------------- nolds/test_measures.py | 4 +- 2 files changed, 84 insertions(+), 61 deletions(-) diff --git a/nolds/measures.py b/nolds/measures.py index 598e13c..5c89e2e 100644 --- a/nolds/measures.py +++ b/nolds/measures.py @@ -37,6 +37,25 @@ FloatArrayLike1D: TypeAlias = ArrayLike # 1D structure containing float values NumberArrayLike1D: TypeAlias = ArrayLike # 1D structure containing number values +float_precision = np.float64 +"""Default floating point precision used by nolds. + +Within nolds, this is considered static. However, downstream code might want to +change the precision for a specific measure to save time and space. + +This use case is not common enough to warrant an entire API around it, but +by introducing this variable we at least enable it in principle. +""" +int_precision = np.int32 +"""Default integer precision used by nolds. + +Within nolds, this is considered static. However, downstream code might want to +change the precision for a specific measure to save time and space. + +This use case is not common enough to warrant an entire API around it, but +by introducing this variable we at least enable it in principle. 
+""" + def rowwise_chebyshev(x: NumberArray2D, y: NumberArray1D) -> NumberArray1D: """Returns the Chebyshev distances between each row of matrix x and the reference row y.""" @@ -133,7 +152,7 @@ def delay_embedding(data: NumberArrayLike1D, emb_dim: int, lag: int = 1) -> Floa for i in 0 to m-1 (m = len(data)-(emb_dim-1)*lag) """ if not isinstance(data, np.ndarray): - data = np.asarray(data, dtype=np.float64) + data = np.asarray(data, dtype=float_precision) min_len = (emb_dim - 1) * lag + 1 if len(data) < min_len: msg = ( @@ -189,7 +208,7 @@ def lyap_r( plot_file: str | Path | None = None, fit_offset: int = 0, random_state: int | None = None, -) -> np.float64: ... +) -> float_precision: ... @overload @@ -209,7 +228,7 @@ def lyap_r( fit_offset: int = 0, random_state: int | None = None, ) -> tuple[ - np.float64, + float_precision, tuple[ IntArray1D, FloatArray1D, @@ -234,7 +253,7 @@ def lyap_r( # noqa: C901, PLR0912, PLR0915 fit_offset: int = 0, random_state: int | None = None, ) -> ( - np.float64 + float_precision | tuple[ float, tuple[ @@ -352,7 +371,7 @@ def lyap_r( # noqa: C901, PLR0912, PLR0915 - the line coefficients (`[slope, intercept]`). 
""" # convert data to float to avoid overflow errors in rowwise_euclidean - data = np.asarray(data, dtype=np.float64) + data = np.asarray(data, dtype=float_precision) n = len(data) max_tsep_factor = 0.25 if lag is None or min_tsep is None: @@ -426,7 +445,7 @@ def nb_neighbors(lag_value: int) -> int: orbit = delay_embedding(data, emb_dim, lag) m = len(orbit) # construct matrix with pairwise distances between vectors in orbit - dists = np.array([rowwise_euclidean(orbit, orbit[i]) for i in range(m)]) + dists = np.array([rowwise_euclidean(orbit, orbit[i]) for i in range(m)], dtype=float_precision) # we do not want to consider vectors as neighbor that are less than min_tsep # time steps together => mask the distances min_tsep to the right and left of # each index by setting them to infinity (will never be considered as nearest @@ -460,7 +479,7 @@ def nb_neighbors(lag_value: int) -> int: # build divergence trajectory by averaging distances along the trajectory # over all neighbor pairs - div_traj = np.zeros(trajectory_len, dtype=np.float64) + div_traj = np.zeros(trajectory_len, dtype=float_precision) for k in range(trajectory_len): # calculate mean trajectory distance at step k indices = (np.arange(ntraj) + k, nb_idx + k) @@ -480,7 +499,7 @@ def nb_neighbors(lag_value: int) -> int: if len(ks) < 1: # if all points or all but one point in the trajectory is -inf, we cannot # fit a line through the remaining points => return -inf as exponent - poly = np.array([-np.inf, 0], dtype=np.float64) + poly = np.array([-np.inf, 0], dtype=float_precision) else: # normal line fitting poly = poly_fit( @@ -492,7 +511,7 @@ def nb_neighbors(lag_value: int) -> int: ) if debug_plot: plot_reg( - ks[fit_offset:].astype(np.float64), + ks[fit_offset:].astype(float_precision), div_traj[fit_offset:], poly, "k", @@ -669,7 +688,7 @@ def lyap_e( # noqa: C901, PLR0915 iterations of R_i. The shape of this debug data is (x, matrix_dim). 
""" # convert to float to avoid errors when using 'inf' as distance - data = np.asarray(data, dtype=np.float64) + data = np.asarray(data, dtype=float_precision) n = len(data) if (emb_dim - 1) % (matrix_dim - 1) != 0: msg = "emb_dim - 1 must be divisible by matrix_dim - 1!" @@ -714,7 +733,7 @@ def lyap_e( # noqa: C901, PLR0915 ) raise ValueError(msg.format(min_nb - len(orbit), min_nb)) old_Q = np.identity(matrix_dim) - lexp = np.zeros(matrix_dim, dtype=np.float64) + lexp = np.zeros(matrix_dim, dtype=float_precision) lexp_counts = np.zeros(lexp.shape) debug_values = [] for i in range(len(orbit)): @@ -773,7 +792,7 @@ def lyap_e( # noqa: C901, PLR0915 # ... # note: emb_dim = (d_M - 1) * m + 1 # noqa: ERA001 - mat_X = np.array([data[j : j + emb_dim : m] for j in indices]) + mat_X = np.array([data[j : j + emb_dim : m] for j in indices], dtype=float_precision) mat_X -= data[i : i + emb_dim : m] # build vector beta for linear least squares @@ -817,7 +836,7 @@ def lyap_e( # noqa: C901, PLR0915 diag_R = np.diag(mat_R) # filter zeros in mat_R (would lead to -infs) idx = np.where(diag_R > 0) - lexp_i = np.zeros(diag_R.shape, dtype=np.float64) + lexp_i = np.zeros(diag_R.shape, dtype=float_precision) lexp_i[idx] = np.log(diag_R[idx]) lexp_i[np.where(diag_R == 0)] = np.inf if debug_plot or debug_data: @@ -828,7 +847,9 @@ def lyap_e( # noqa: C901, PLR0915 # it may happen that all R-matrices contained zeros => exponent really has # to be -inf if debug_plot: - plot_histogram_matrix(np.array(debug_values), "layp_e", fname=plot_file) + plot_histogram_matrix( + np.array(debug_values, dtype=float_precision), "layp_e", fname=plot_file + ) # normalize exponents over number of individual mat_Rs idx = np.where(lexp_counts > 0) lexp[idx] /= lexp_counts[idx] @@ -838,7 +859,7 @@ def lyap_e( # noqa: C901, PLR0915 # take m into account lexp /= m if debug_data: - return (lexp, np.array(debug_values)) + return (lexp, np.array(debug_values, dtype=float_precision)) return lexp @@ -1025,7 +1046,7 @@ 
def sampen( # noqa: C901, PLR0912 - [dists_m, dists_m1]: the distances between template vectors for m (dists_m) and for m + 1 (dists_m1). """ - data = np.asarray(data) + data = np.asarray(data, dtype=float_precision) if tolerance is None: # the reasoning behind this default value is the following: @@ -1258,7 +1279,7 @@ def expected_h( Returns: expected h for white noise """ - nvals = np.asarray(nvals, dtype=np.int32) + nvals = np.asarray(nvals, dtype=int_precision) rsvals = [expected_rs(n) for n in nvals] poly = poly_fit(np.log(nvals), np.log(rsvals), 1, fit=fit, random_state=random_state) return poly[0] @@ -1281,7 +1302,7 @@ def rs(data: FloatArray1D, n: np.integer, *, unbiased: bool = True) -> float: Returns: (R/S)_n """ - data = np.asarray(data) + data = np.asarray(data, dtype=float_precision) total_N = len(data) m = total_N // n # number of sequences # cut values at the end of data to make the array divisible by n @@ -1348,7 +1369,7 @@ def plot_histogram_matrix( rng = (float(mu - n * sigma), float(mu + n * sigma)) h, bins = np.histogram(data[:, i], nbins, rng) bin_width = bins[1] - bins[0] - h = h.astype(np.float64) / np.sum(h) + h = h.astype(float_precision) / np.sum(h) plt.bar(bins[:-1], h, bin_width) plt.axvline(float(np.mean(data[:, i])), color="red") plt.ylim(ylim) @@ -1713,7 +1734,7 @@ def hurst_rs( - rsvals: the corresponding (R/S)_n values - poly: the coefficients of the line fit (``[slope, intercept]`` """ - data = np.asarray(data) + data = np.asarray(data, dtype=float_precision) total_N = len(data) if nvals is None: # chooses a default value for nvals that will give 15 logarithmically @@ -1721,17 +1742,17 @@ def hurst_rs( # (since both too small and too large n introduce too much variance) nvals = logmid_n(total_N, ratio=1 / 4.0, nsteps=15) else: - nvals = np.array(nvals, dtype=np.int32) + nvals = np.array(nvals, dtype=int_precision) # get individual values for (R/S)_n - rsvals = np.array([rs(data, n, unbiased=unbiased) for n in nvals]) + rsvals = 
np.array([rs(data, n, unbiased=unbiased) for n in nvals], dtype=float_precision) # filter NaNs (zeros should not be possible, because if R is 0 then # S is also zero) not_nan = np.logical_not(np.isnan(rsvals)) rsvals = rsvals[not_nan] - nvals = np.asarray(nvals)[not_nan] + nvals = np.asarray(nvals, dtype=int_precision)[not_nan] # it may happen that no rsvals are left (if all values of data are the same) if len(rsvals) == 0: - poly = np.array([np.nan, np.nan], dtype=np.float64) + poly = np.array([np.nan, np.nan], dtype=float_precision) if debug_plot: warnings.warn( "Cannot display debug plot, all (R/S)_n are NaN", @@ -1917,15 +1938,15 @@ def mfhurst_b( for each q in the shape (len(qvals), 2). """ # transform to array if necessary - data = np.asarray(data, dtype=np.float64) + data = np.asarray(data, dtype=float_precision) if qvals is None: # actual default parameter would introduce shared list # see: http://pylint-messages.wikidot.com/messages:w0102 qvals = [1] - qvals = np.asarray(qvals, dtype=np.float64) + qvals = np.asarray(qvals, dtype=float_precision) if dists is None: dists = logarithmic_n(1, np.ceil(max(20, 0.02 * len(data))), 1.5) - dists = np.asarray(dists, dtype=np.int32) + dists = np.asarray(dists, dtype=int_precision) min_reliable_n = 60 if len(data) < min_reliable_n: warnings.warn( @@ -1941,7 +1962,7 @@ def hhcorr(d: np.integer, q: np.floating) -> np.floating: # calculate height-height correlations corrvals = [hhcorr(d, q) for d in dists for q in qvals] - corrvals = np.array(corrvals, dtype=np.float64) + corrvals = np.array(corrvals, dtype=float_precision) corrvals = corrvals.reshape(len(dists), len(qvals)) # line fitting @@ -1952,14 +1973,14 @@ def hhcorr(d: np.integer, q: np.floating) -> np.floating: poly_fit(xvals, yvals[:, qi], 1, fit=fit, random_state=random_state) for qi in range(len(qvals)) ], - dtype=np.float64, + dtype=float_precision, ) - H = np.array(polys)[:, 0] / qvals + H = polys[:, 0] / qvals if debug_plot: plot_reg_multiple( - 
np.array([xvals] * len(qvals), dtype=np.float64), - np.array([yvals[:, qi] / qvals[qi] for qi in range(len(qvals))], dtype=np.float64), - np.array([p / q for p, q in zip(polys, qvals, strict=False)], dtype=np.float64), + np.array([xvals] * len(qvals), dtype=float_precision), + np.array([yvals[:, qi] / qvals[qi] for qi in range(len(qvals))], dtype=float_precision), + np.array([p / q for p, q in zip(polys, qvals, strict=False)], dtype=float_precision), x_label="log(x)", y_label="$\\log(c_q(x)) / q$", data_labels=[f"q = {q}" for q in qvals], @@ -2029,7 +2050,7 @@ def _genhurst(S: FloatArray1D, q: float) -> float: dV = S[np.arange(tt, L, tt)] - S[np.arange(tt, L, tt) - tt] VV = S[np.arange(tt, L + tt, tt) - tt] N = len(dV) + 1 - X = np.arange(1, N + 1, dtype=np.float64) + X = np.arange(1, N + 1, dtype=float_precision) Y = VV mx = np.sum(X) / N SSxx = np.sum(X**2) - N * mx**2 @@ -2038,7 +2059,7 @@ def _genhurst(S: FloatArray1D, q: float) -> float: cc1 = SSxy / SSxx cc2 = my - cc1 * mx ddVd = dV - cc1 - VVVd = VV - np.multiply(cc1, np.arange(1, N + 1, dtype=np.float64)) - cc2 + VVVd = VV - np.multiply(cc1, np.arange(1, N + 1, dtype=float_precision)) - cc2 mcord[tt - 1] = np.mean(np.abs(ddVd) ** q) / np.mean(np.abs(VVVd) ** q) mx = np.mean(np.log10(x)) SSxx = np.sum(np.log10(x) ** 2) - Tmax * mx**2 @@ -2070,8 +2091,8 @@ def _aste_line_fit( results with a call to ``np.polyfit(x, y, 1)[::-1]``. """ # convert to float to avoid integer overflow problems - x = np.asarray(x, dtype=np.float64) - y = np.asarray(y, dtype=np.float64) + x = np.asarray(x, dtype=float_precision) + y = np.asarray(y, dtype=float_precision) N = len(x) mx = np.mean(x) my = np.mean(y) @@ -2257,15 +2278,15 @@ def mfhurst_dm( for each q in the shape (len(qvals), 2). 
""" # transform to array if necessary - data = np.asarray(data) + data = np.asarray(data, dtype=float_precision) if qvals is None: # actual default parameter would introduce shared list # see: http://pylint-messages.wikidot.com/messages:w0102 qvals = [1] - qvals = np.asarray(qvals, dtype=np.float64) + qvals = np.asarray(qvals, dtype=float_precision) if max_dists is None: max_dists = range(5, 20) - max_dists = np.asarray(max_dists, dtype=np.int32) + max_dists = np.asarray(max_dists, dtype=int_precision) min_reliable_n = 60 if len(data) < min_reliable_n: warnings.warn( @@ -2288,7 +2309,7 @@ def mfhurst_dm( stepdata = detrend_data(stepdata, order=1, random_state=random_state) diffs = stepdata[1:] - stepdata[:-1] hhcorr.append([np.mean(np.abs(diffs) ** q) / np.mean(np.abs(stepdata) ** q) for q in qvals]) - hhcorr = np.array(hhcorr, dtype=np.float64) + hhcorr = np.array(hhcorr, dtype=float_precision) xvals = np.log(np.arange(1, max_max_dist + 1)) yvals = np.log(hhcorr) # NOTE: Using several maximum distances seems to be a strange way to @@ -2300,16 +2321,16 @@ def mfhurst_dm( for qi in range(len(qvals)) for md in max_dists ], - dtype=np.float64, + dtype=float_precision, ).reshape(len(qvals), len(max_dists)) if debug_plot: polys = np.array( [poly_fit(xvals, yvals[:, qi], 1) / qvals[qi] for qi in range(len(qvals))], - dtype=np.float64, + dtype=float_precision, ) plot_reg_multiple( - np.array([xvals] * len(qvals), dtype=np.float64), - np.array([yvals[:, qi] / qvals[qi] for qi in range(len(qvals))], dtype=np.float64), + np.array([xvals] * len(qvals), dtype=float_precision), + np.array([yvals[:, qi] / qvals[qi] for qi in range(len(qvals))], dtype=float_precision), polys, x_label="log(x)", y_label="$\\log(c_q(x)) / q$", @@ -2487,17 +2508,17 @@ def corr_dim( - poly: the line coefficients (``[slope, intercept]``) """ # TODO: determine lag in units of time instead of number of datapoints - data = np.asarray(data) + data = np.asarray(data, dtype=float_precision) # TODO: what are 
good values for r? # TODO: do this for multiple values of emb_dim? if rvals is None: sd = float(np.std(data, ddof=1)) rvals = logarithmic_r(0.1 * sd, 0.5 * sd, 1.03) - rvals = np.asarray(rvals, dtype=np.float64) + rvals = np.asarray(rvals, dtype=float_precision) orbit = delay_embedding(data, emb_dim, lag=lag) n = len(orbit) - dists = np.zeros((len(orbit), len(orbit)), dtype=np.float64) + dists = np.zeros((len(orbit), len(orbit)), dtype=float_precision) for i in range(len(orbit)): # calculate distances between X_i and X_i+1, X_i+2, ... , X_n-1 # NOTE: strictly speaking, [cd_1] does not specify to exclude self-matches @@ -2517,14 +2538,14 @@ def corr_dim( # also exclude self-matches from the denominator. s = 1.0 / (n * (n - 1)) * np.sum(dists <= r) csums.append(s) - csums = np.array(csums) + csums = np.array(csums, dtype=float_precision) # filter zeros from csums nonzero = np.where(csums != 0) - rvals = np.array(rvals)[nonzero] + rvals = rvals[nonzero] csums = csums[nonzero] if len(csums) == 0: # all sums are zero => we cannot fit a line - poly = np.array([np.nan, np.nan], dtype=np.float64) + poly = np.array([np.nan, np.nan], dtype=float_precision) else: poly = poly_fit(np.log(rvals), np.log(csums), 1, fit=fit, random_state=random_state) if debug_plot: @@ -2760,7 +2781,7 @@ def dfa( # noqa: C901, PLR0912, PLR0915 - fluctuations: the corresponding log(std(X,n)) - poly: the line coefficients (``[slope, intercept]``) """ - data = np.asarray(data) + data = np.asarray(data, dtype=float_precision) total_N = len(data) if nvals is None: min_n_for_log_scale = 70 @@ -2773,7 +2794,7 @@ def dfa( # noqa: C901, PLR0912, PLR0915 nvals = [total_N - 2, total_N - 1] msg = "choosing nvals = {} , DFA with less than ten data points is extremely unreliable" warnings.warn(msg.format(nvals), RuntimeWarning, stacklevel=2) - nvals = np.asarray(nvals, dtype=np.int32) + nvals = np.asarray(nvals, dtype=int_precision) min_number_of_nvals = 2 min_nval = 2 if nvals.shape[0] < min_number_of_nvals: 
@@ -2794,7 +2815,9 @@ def dfa( # noqa: C901, PLR0912, PLR0915 # subdivide data into chunks of size n if overlap: # step size n/2 instead of n - d = np.array([walk[i : i + n] for i in range(0, len(walk) - n, n // 2)]) + d = np.array( + [walk[i : i + n] for i in range(0, len(walk) - n, n // 2)], dtype=float_precision + ) else: # non-overlapping windows => we can simply do a reshape d = walk[: total_N - (total_N % n)] @@ -2805,8 +2828,8 @@ def dfa( # noqa: C901, PLR0912, PLR0915 poly_fit(x, d[i], order, fit=fit_trend, random_state=random_state) for i in range(len(d)) ] - tpoly = np.array(tpoly) - trend = np.array([np.polyval(tpoly[i], x) for i in range(len(d))]) + tpoly = np.array(tpoly, dtype=float_precision) + trend = np.array([np.polyval(tpoly[i], x) for i in range(len(d))], dtype=float_precision) # calculate mean-square differences for each walk in d around trend flucs = np.sum((d - trend) ** 2, axis=1) / n # take another mean across all walks and finally take the square root of that @@ -2815,14 +2838,14 @@ def dfa( # noqa: C901, PLR0912, PLR0915 # windows and the last window matches the end of the data perfectly. 
f_n = np.sqrt(np.sum(flucs) / len(flucs)) fluctuations.append(f_n) - fluctuations = np.array(fluctuations) + fluctuations = np.array(fluctuations, dtype=float_precision) # filter zeros from fluctuations nonzero = np.where(fluctuations != 0) - nvals = np.array(nvals)[nonzero] + nvals = nvals[nonzero] fluctuations = fluctuations[nonzero] if len(fluctuations) == 0: # all fluctuations are zero => we cannot fit a line - poly = np.array([np.nan, np.nan], dtype=np.float64) + poly = np.array([np.nan, np.nan], dtype=float_precision) else: poly = poly_fit( np.log(nvals), diff --git a/nolds/test_measures.py b/nolds/test_measures.py index 771a610..ab2a580 100644 --- a/nolds/test_measures.py +++ b/nolds/test_measures.py @@ -767,7 +767,7 @@ def test_corr_dim(self) -> None: dist=measures.rowwise_euclidean, fit="poly", ) - self.assertAlmostEqual(1.303252839255068, cd, places=14) + self.assertAlmostEqual(0.0810185360746645, cd, places=14) @unittest.skipUnless(SCIPY_AVAILABLE, "Tests with RANSAC require scipy.") def test_corr_dim_RANSAC(self) -> None: # noqa: N802 @@ -787,7 +787,7 @@ def test_corr_dim_RANSAC(self) -> None: # noqa: N802 fit="RANSAC", random_state=42, ) - self.assertAlmostEqual(0.44745494643404665, cd, places=14) + self.assertAlmostEqual(0.0008971209283844629, cd, places=14) def test_lyap_e(self) -> None: """Hypothesis: The exact output of lyap_e remains unchanged. From 1e95be41c2043e503daca09e443ed5bb5072aa97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christopher=20Sch=C3=B6lzel?= Date: Sun, 31 Aug 2025 19:43:28 +0200 Subject: [PATCH 34/36] updates changelog --- CHANGELOG.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index c7e243b..1c3e93d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](http://semver.org/). ## [Unreleased] ### Added + +* Type hints for the complete API. +* Documentation for test hypotheses. 
+ ### Changed * Switches from using `setup.py` to `pyproject.toml` using `uv`. @@ -15,10 +19,15 @@ and this project adheres to [Semantic Versioning](http://semver.org/). * Datasets that are available as global variables are loaded lazily now. * `datasets.qrandom` output now has a more accurate dtype of `np.uint16`. * Uses `importlib.resources.files` instead of deprecated `pkg_resources.resource_stream`. +* Applies dtype `float64` or `int32` in all internal conversions using `asarray` or `array`. * Applies Ruff formatting throughout the codebase and addresses all linting errors. +* Switches from two spaces to four spaces for indentation. +* Supported range of Python versions is now 3.10–3.13. Updated CI accordingly. ### Fixed +* Swaps deprecated `pkg_resources` for `importlib.resources`. + ## [0.6.2] ### Fixed From ff5c88d63c18521706d7692bd971e1cd4f186b95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christopher=20Sch=C3=B6lzel?= Date: Sun, 31 Aug 2025 19:47:14 +0200 Subject: [PATCH 35/36] fixes typo Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- nolds/test_measures.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nolds/test_measures.py b/nolds/test_measures.py index ab2a580..e33b6b9 100644 --- a/nolds/test_measures.py +++ b/nolds/test_measures.py @@ -95,7 +95,7 @@ def test_delay_embed_lag3(self) -> None: self.assert_array_equal(expected, embedded) def test_delay_embed_empty(self) -> None: - """Hpoythesis: An error is raised when settings would lead to an empty orbit vector list.""" + """Hypothesis: An error is raised when settings would lead to an empty orbit vector list.""" data = np.arange(10, dtype=np.float64) try: embedded = measures.delay_embedding(data, 11) From 0fbbb8dca436be02a538d545770c60e1e862350c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christopher=20Sch=C3=B6lzel?= Date: Sun, 31 Aug 2025 19:53:55 +0200 Subject: [PATCH 36/36] fixes typos and unnecessary code --- nolds/datasets.py | 4 ++-- nolds/test_measures.py | 
5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/nolds/datasets.py b/nolds/datasets.py index b07a032..6dc37d0 100644 --- a/nolds/datasets.py +++ b/nolds/datasets.py @@ -35,7 +35,7 @@ def lorenz_euler( length: Number of data points to generate. sigma: Sigma parameter of the Lorenz system. rho: Rho parameter of the Lorenz system. - beta: Beta paramete rof the Lorenz system. + beta: Beta parameter of the Lorenz system. dt: Time delta between two data points. start: Optional starting point for the trajectory. @@ -77,7 +77,7 @@ def lorenz_lyap(sigma: float, rho: float, beta: float) -> float: Args: sigma: Sigma parameter of the Lorenz system. rho: Rho parameter of the Lorenz system. - beta: Beta paramete rof the Lorenz system. + beta: Beta parameter of the Lorenz system. Returns: Prescribed Lyapunov dimension for the Lorenz system according to Leonov 2015. diff --git a/nolds/test_measures.py b/nolds/test_measures.py index e33b6b9..b2b30b0 100644 --- a/nolds/test_measures.py +++ b/nolds/test_measures.py @@ -168,10 +168,10 @@ def logistic(x: float, r: float) -> float: x = logistic(x, r) log.append(x) log = np.array(log, dtype=np.float64) - with self.subTest(meashure="lyap_e", r=r): + with self.subTest(measure="lyap_e", r=r): le = np.max(measures.lyap_e(log, emb_dim=6, matrix_dim=2)) self.assertEqual(s, np.sign(le)) - with self.subTest(meashure="lyap_r", r=r): + with self.subTest(measure="lyap_r", r=r): lr = measures.lyap_r(log, emb_dim=6, lag=2, min_tsep=10, trajectory_len=20) self.assertEqual(s, np.sign(lr)) @@ -479,7 +479,6 @@ def test_dfa_positive_correlation(self) -> None: h_walk = measures.dfa(self.positive_correlation) # expected h is around 1.0 self.assertGreater(h_walk, 0.7) - assert h_walk > 0.7 def test_dfa_fbm(self) -> None: """Hypothesis: H ~= h + 1 for fractional brownian motion with Hurst parameter h."""