From 1998ce38a95597edf410104890fc98b71027bab1 Mon Sep 17 00:00:00 2001
From: Steven Maude <git@stevenmaude.co.uk>
Date: Tue, 12 May 2026 15:59:04 +0100
Subject: [PATCH] Remove `extract-stats` command

Fixes #394.

Reasoning:

* It's already been deprecated a while.
* It was only for use with `cohort-extractor` logs, and that has now
  been marked as unsupported since June 2025;
  reference: https://github.com/opensafely-core/cohort-extractor/commit/bb89f630edc13d420934a15d6d86a360859e50ef
---
 opensafely/__init__.py                        |   2 -
 opensafely/extract_stats.py                   | 226 ------------------
 .../metadata/executor_api_logs/action1.log    |  27 ---
 .../metadata/executor_api_logs/action2.log    |  33 ---
 .../metadata/executor_api_logs/action3.log    |  15 --
 .../metadata/executor_api_logs/non-action.log |   4 -
 .../metadata/old_style_logs/action1.log       |  29 ---
 .../metadata/old_style_logs/action2.log       |  35 ---
 .../metadata/old_style_logs/action3.log       |  16 --
 .../metadata/old_style_logs/non-action.log    |   4 -
 tests/test_extract_stats.py                   |  96 --------
 11 files changed, 487 deletions(-)
 delete mode 100755 opensafely/extract_stats.py
 delete mode 100644 tests/fixtures/metadata/executor_api_logs/action1.log
 delete mode 100644 tests/fixtures/metadata/executor_api_logs/action2.log
 delete mode 100644 tests/fixtures/metadata/executor_api_logs/action3.log
 delete mode 100644 tests/fixtures/metadata/executor_api_logs/non-action.log
 delete mode 100644 tests/fixtures/metadata/old_style_logs/action1.log
 delete mode 100644 tests/fixtures/metadata/old_style_logs/action2.log
 delete mode 100644 tests/fixtures/metadata/old_style_logs/action3.log
 delete mode 100644 tests/fixtures/metadata/old_style_logs/non-action.log
 delete mode 100644 tests/test_extract_stats.py

diff --git a/opensafely/__init__.py b/opensafely/__init__.py
index fd6fc647..67f64bf4 100644
--- a/opensafely/__init__.py
+++ b/opensafely/__init__.py
@@ -11,7 +11,6 @@
     clean,
     codelists,
     execute,
-    extract_stats,
     info,
     jupyter,
     launch,
@@ -150,7 +149,6 @@ def add_subcommand(cmd, module):
     add_subcommand("pull", pull)
     add_subcommand("upgrade", upgrade)
     add_subcommand("unzip", unzip)
-    add_subcommand("extract-stats", extract_stats)
     add_subcommand("info", info)
     add_subcommand("exec", execute)
     add_subcommand("clean", clean)
diff --git a/opensafely/extract_stats.py b/opensafely/extract_stats.py
deleted file mode 100755
index 4307fbc0..00000000
--- a/opensafely/extract_stats.py
+++ /dev/null
@@ -1,226 +0,0 @@
-#!/usr/bin/python3
-import json
-import re
-from argparse import ArgumentParser
-from collections import Counter
-from datetime import datetime
-from pathlib import Path
-
-
-DESCRIPTION = "Parse and output cohortextractor-stats logs as JSONL"
-TIMESTAMP_PREFIX_REGEX = re.compile(r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}.\d{9}Z")
-COMPLETED_PREFIX_REGEX = re.compile(r"^Completed successfully|outputs:")
-KEY_VALUE_REGEX = re.compile(r"(?<=\s)([^\s=]+)=(.*?)(?=(?:\s[^\s=]+=|$))")
-ACTION_SUMMARY_REGEX = re.compile(
-    r"^(state|exit_code|docker_image_id|job_id|job_request_id|run_by_user|created_at|completed_at|local_run):\s(.*)"
-)
-
-
-def add_arguments(parser):
-    parser.add_argument(
-        "--project-dir",
-        help="Project directory (default: current directory)",
-        type=Path,
-        default=".",
-    )
-    parser.add_argument(
-        "--project-name",
-        help="Project name (default: project-dir)",
-        type=str,
-    )
-    parser.add_argument(
-        "--output-file",
-        help="Output json filename (default: extracted_stats.json)",
-        type=Path,
-        default="extracted_stats.json",
-    )
-    return parser
-
-
-def _timestamp_for_honeytail(ts_datetime):
-    honeytail_ts_format = "%Y-%m-%dT%H:%M:%S.000000000Z"
-    return ts_datetime.strftime(honeytail_ts_format)
-
-
-def parse_log(project_name, current_action, job_id, job_request_id, current_log):
-    current_log_timestamp = TIMESTAMP_PREFIX_REGEX.match(current_log).group()
-    current_log = re.sub(r"\s*\n\s*", " ", current_log).strip()
-    log_params = dict(KEY_VALUE_REGEX.findall(current_log))
-    return {
-        "timestamp": current_log_timestamp,
-        "project": project_name,
-        "job_id": job_id,
-        "job_request_id": job_request_id,
-        "action": current_action,
-        **log_params,
-    }
-
-
-def parse_stats_logs(raw_logs, project_name, current_action, job_id, job_request_id):
-    for log in raw_logs:
-        yield parse_log(project_name, current_action, job_id, job_request_id, log)
-
-
-def _parse_summary_datetime(datetime_string, ts_format):
-    """
-    Created/completed in the log summary may be a stingified int (unix time)
-    or ISO-8061 UTC format
-    Convert the string to a datetime object and return both the datetime object and
-    its ISO-8061 UTC string
-    """
-    try:
-        datetime_obj = datetime.fromtimestamp(int(datetime_string))
-        return datetime_obj, datetime_obj.strftime(ts_format)
-    except ValueError:
-        return datetime.strptime(datetime_string, ts_format), datetime_string
-
-
-def format_summary_stats(project_name, current_action, summary_stats):
-    start_time = summary_stats.get("created_at")
-    end_time = summary_stats.get("completed_at")
-
-    iso_utc_format = "%Y-%m-%dT%H:%M:%SZ"
-
-    start_dt, start_time_formatted = _parse_summary_datetime(start_time, iso_utc_format)
-    summary_stats["created_at"] = start_time_formatted
-    if start_time and end_time:
-        end_dt, end_time_formatted = _parse_summary_datetime(end_time, iso_utc_format)
-        elapsed = end_dt - start_dt
-        # prefix summary elapsed time to make filtering easier
-        summary_stats["action_elapsed_time_secs"] = elapsed.seconds
-        summary_stats["action_elapsed_time"] = str(elapsed)
-        summary_stats["completed_at"] = end_time_formatted
-
-    return {
-        # add the timestamp field in the format that honeytail requires
-        "timestamp": _timestamp_for_honeytail(start_dt),
-        "project": project_name,
-        "action": current_action,
-        **{k: v for k, v in summary_stats.items()},
-    }
-
-
-def add_action_counts_by_job_request(summary_stats_list):
-    job_request_counts = Counter(
-        [
-            summary.get("job_request_id")
-            for summary in summary_stats_list
-            if "job_request_id" in summary
-        ]
-    )
-    # Include the total number of actions run alongside this one in each action summary
-    summary_stats_with_action_counts = []
-    for summary in summary_stats_list:
-        job_request_id = summary.get("job_request_id")
-        action_count = job_request_counts[job_request_id] if job_request_id else 0
-        summary_stats_with_action_counts.append(
-            {**summary, "total_actions_in_job_request": action_count}
-        )
-    return summary_stats_with_action_counts
-
-
-def main(project_dir, output_file, project_name=None):
-    """
-    Read all log files from the project's metadata folder and extract stats logs.
-
-    Cohort-extractor actions log stats with the event "cohortextractor-stats",
-    which is used to idenitfy stats logs, and any number of key-value pairs of
-    stats data in the format key=value.  Logs can span multiple lines, and values in
-    a key=value pair can contain spaces.
-
-    In addition, each log file contains a section with overall job summary data, which
-    is extracted into a single additional log, and other non-stats logs, which are ignored.
-
-    Output is in JSONL format, one log per line.
-    """
-    log_dir = project_dir / "metadata"
-    project_name = project_name or project_dir.resolve().name
-
-    log_files = list(log_dir.glob("*.log"))
-
-    stats_logs = []
-    summary_stats_logs = []
-
-    for filep in log_files:
-        current_action = filep.stem
-        summary_stats = {}
-        raw_logs = []
-        # logs in the log file can span multiple lines, and are a mixture of stats logs,
-        # which we want to extract, and other logs, which we mostly don't care about.
-        # `extracted_log` keeps track of the log that's being parsed as we interate over the log
-        # file line-by-line, collecting the stats logs into the `raw_logs` list
-
-        extracted_log = None
-        for line in filep.open().readlines():
-            # check if this line is the beginning of a new log or the beginning of the end-of-file
-            # action summary
-            if TIMESTAMP_PREFIX_REGEX.match(line) or COMPLETED_PREFIX_REGEX.match(line):
-                # We're at the beginning of a new log
-                # If we were in the process of extracting a log, we're finished with it now
-                # Add it to the list of raw logs to be processed later
-                # (extracted_log can be None if we're at the beginning of the file, or if
-                # the previous line wasn't a stats log)
-                if extracted_log is not None:
-                    raw_logs.append(extracted_log)
-                # Now reset the extracted_log...
-                if "cohortextractor-stats" in line:
-                    # ...to the current line if it's a stats log
-                    extracted_log = line
-                else:
-                    # ...to None if it's a standard non-stats log
-                    extracted_log = None
-            elif ACTION_SUMMARY_REGEX.match(line):
-                # Check for the summary stats lines
-                summary_stats.update(dict(ACTION_SUMMARY_REGEX.findall(line)))
-            elif extracted_log is not None:
-                # this line isn't a log start, and we're in the process of extracting a log,
-                # so it must be a continuation line. Append it to the log we're extracting.
-                extracted_log += line
-        # Add the final extracted log, if there is one
-        if extracted_log is not None:
-            raw_logs.append(extracted_log)
-
-        # At the end of each processed file, we parse the raw logs into a dict
-        # and add them to the full list that will be written to file
-        if raw_logs:
-            stats_logs.extend(
-                parse_stats_logs(
-                    raw_logs,
-                    project_name,
-                    current_action,
-                    summary_stats.get("job_id"),
-                    summary_stats.get("job_request_id"),
-                )
-            )
-
-        # Format and add the summary stats
-        # Check for job_id in case we encountered other, non-job logs in the metadata folder
-        if summary_stats.get("job_id"):
-            summary_stats_logs.append(
-                format_summary_stats(project_name, current_action, summary_stats)
-            )
-
-    # Now that all log files are processed, find the action acounts by
-    # job request
-    summary_stats_logs = add_action_counts_by_job_request(summary_stats_logs)
-
-    with (log_dir / output_file).open("w") as outpath:
-        for log in [*summary_stats_logs, *stats_logs]:
-            json.dump(log, outpath)
-            outpath.write("\n")
-    print(f"Log stats written to {log_dir / output_file}")
-
-
-def run():
-    parser = ArgumentParser(description=DESCRIPTION)
-    parser = add_arguments(parser)
-    args = parser.parse_args()
-    main(
-        project_dir=args.project_dir / "metadata",
-        output_file=args.output_file,
-        project_name=args.project_name,
-    )
-
-
-if __name__ == "__main__":
-    run()
diff --git a/tests/fixtures/metadata/executor_api_logs/action1.log b/tests/fixtures/metadata/executor_api_logs/action1.log
deleted file mode 100644
index 032f79e6..00000000
--- a/tests/fixtures/metadata/executor_api_logs/action1.log
+++ /dev/null
@@ -1,27 +0,0 @@
-2022-04-25T11:37:20.751678216Z 2022-04-25 11:37:20 [info     ] Generating cohort for study_definition in output [cohortextractor.cohortextractor] 
-2022-04-25T11:37:20.832847251Z 2022-04-25 11:37:20 [info     ] cohortextractor-stats          [cohortextractor.log_utils] output_column_count=33 table_count=30 table_joins_count=30
-2022-04-25T11:37:20.832875666Z 2022-04-25 11:37:20 [info     ] cohortextractor-stats          [cohortextractor.log_utils] variable_count=32
-2022-04-25T11:37:20.832946062Z 2022-04-25 11:37:20 [info     ] cohortextractor-stats          [cohortextractor.log_utils] variables_using_codelist_count=23
-2022-04-25T11:37:20.833012480Z 2022-04-25 11:37:20 [info     ] cohortextractor-stats          [cohortextractor.log_utils] codelist_size=50 variable_using_codelist=learning_disability
-2022-04-25T11:37:20.833090409Z 2022-04-25 11:37:20 [info     ] cohortextractor-stats          [cohortextractor.log_utils] codelist_size=79 variable_using_codelist=medication_review
-2022-04-25T11:37:20.833090412Z 2022-04-25 11:37:20 [info     ] cohortextractor-stats          [cohortextractor.log_utils] description=A description with spaces sql=SELECT * FROM 
-TABLE 
-    with 
-    line 
-    breaks
-2022-04-25T11:37:20.835160414Z 2022-04-25 11:37:20 [info     ] Setting index_date to 2021-04-01 [cohortextractor.cohortextractor] 
-2022-04-25T11:37:21.133637972Z 2022-04-25 11:37:21 [info     ] Successfully created cohort and covariates at output/input_2021-04-01.feather [cohortextractor.cohortextractor] 
-2022-04-25T11:37:21.133714541Z 2022-04-25 11:37:21 [info     ] cohortextractor-stats          [cohortextractor.cohortextractor] description=generate_cohort for study_definition at 2021-04-01 execution_time=0:00:00.282108 execution_time_secs=0.28210839499999985
-2022-04-25T11:37:28.537687970Z 2022-04-25 11:37:28 [info     ] cohortextractor-stats          [cohortextractor.cohortextractor] description=generate_cohort for study_definition (all index dates) execution_time=0:00:07.801681 execution_time_secs=7.801681266000001
-
-
-job_id: 3e6blwbtbxgpm6ji
-job_request_id: 1234
-docker_image_id: sha256:a7b427365c8d7f93004671ce91cf91f703ed3da74c80007c755254c651ebb9ec
-exit_code: 0
-created_at: 1650883036
-completed_at: 1650883052
-local_run: True
-
-outputs:
-  output/input_2019-04-01.feather  - highly_sensitive
diff --git a/tests/fixtures/metadata/executor_api_logs/action2.log b/tests/fixtures/metadata/executor_api_logs/action2.log
deleted file mode 100644
index 730bfbce..00000000
--- a/tests/fixtures/metadata/executor_api_logs/action2.log
+++ /dev/null
@@ -1,33 +0,0 @@
-2022-04-25T11:37:53.516753154Z 2022-04-25 11:37:53 [info     ] Generating measure for study_definition in output [cohortextractor.cohortextractor] 
-2022-04-25T11:37:53.605072735Z 2022-04-25 11:37:53 [info     ] cohortextractor-stats          [cohortextractor.log_utils] output_column_count=33 table_count=30 table_joins_count=30
-2022-04-25T11:37:53.605144893Z 2022-04-25 11:37:53 [info     ] cohortextractor-stats          [cohortextractor.log_utils] variable_count=32
-2022-04-25T11:37:53.608226064Z 2022-04-25 11:37:53 [info     ] cohortextractor-stats          [cohortextractor.cohortextractor] measures_count=2
-2022-04-25T11:37:53.609617608Z 2022-04-25 11:37:53 [info     ] Calculating measures for output/input_2019-12-01.feather [cohortextractor.cohortextractor] 
-2022-04-25T11:37:53.609752917Z 2022-04-25 11:37:53 [info     ] Calculating systolic_bp        [cohortextractor.cohortextractor] 
-2022-04-25T11:37:53.609889694Z 2022-04-25 11:37:53 [info     ] Loading patient data from output/input_2019-12-01.feather [cohortextractor.cohortextractor] 
-2022-04-25T11:37:53.698733081Z 2022-04-25 11:37:53 [info     ] cohortextractor-stats          [cohortextractor.cohortextractor] dataframe=patient_df date=datetime.date(2019, 12, 1) measure_id=systolic_bp memory=41214
-2022-04-25T11:37:53.698892232Z 2022-04-25 11:37:53 [info     ] cohortextractor-stats          [cohortextractor.cohortextractor] date=datetime.date(2019, 12, 1) description=Load patient dataframe for measures execution_time=0:00:00.078953 execution_time_secs=0.07895265700000031 input_file=output/input_2019-12-01.feather
-2022-04-25T11:37:53.707543608Z 2022-04-25 11:37:53 [info     ] cohortextractor-stats          [cohortextractor.cohortextractor] date=datetime.date(2019, 12, 1) description=Calculate measure execution_time=0:00:00.007607 execution_time_secs=0.007606717999999901 measure_id=systolic_bp
-2022-04-25T11:37:53.708836721Z 2022-04-25 11:37:53 [info     ] cohortextractor-stats          [cohortextractor.cohortextractor] dataframe=measure_df date=datetime.date(2019, 12, 1) measure_id=systolic_bp memory=3509
-2022-04-25T11:37:53.711864471Z 2022-04-25 11:37:53 [info     ] Created measure output at output/measure_systolic_bp_2019-12-01.csv [cohortextractor.cohortextractor] 
-2022-04-25T11:37:53.711955357Z 2022-04-25 11:37:53 [info     ] Calculating systolic_bp_practice_only [cohortextractor.cohortextractor] 
-2022-04-25T11:37:53.716617612Z 2022-04-25 11:37:53 [info     ] cohortextractor-stats          [cohortextractor.cohortextractor] date=datetime.date(2019, 12, 1) description=Calculate measure execution_time=0:00:00.004357 execution_time_secs=0.004357261000000001 measure_id=systolic_bp_practice_only
-2022-04-25T11:37:53.717850127Z 2022-04-25 11:37:53 [info     ] cohortextractor-stats          [cohortextractor.cohortextractor] dataframe=measure_df date=datetime.date(2019, 12, 1) measure_id=systolic_bp_practice_only memory=1120
-2022-04-25T11:37:53.719066327Z 2022-04-25 11:37:53 [info     ] Created measure output at output/measure_systolic_bp_practice_only_2019-12-01.csv [cohortextractor.cohortextractor] 
-2022-04-25T11:37:58.169074136Z 2022-04-25 11:37:58 [info     ] Combined measure output for all dates in output/measure_systolic_bp.csv [cohortextractor.cohortextractor] 
-2022-04-25T11:37:58.171443532Z 2022-04-25 11:37:58 [info     ] Combined measure output for all dates in output/measure_systolic_bp_practice_only.csv [cohortextractor.cohortextractor] 
-
-
-job_id: yf5dm6ekrw7cv5vo
-job_request_id: 1234
-docker_image_id: sha256:a7b427365c8d7f93004671ce91cf91f703ed3da74c80007c755254c651ebb9ec
-exit_code: 0
-created_at: 1650883036
-completed_at: 1650883052
-local_run: True
-
-outputs:
-  output/measure_systolic_bp.csv                                         - moderately_sensitive
-  output/measure_systolic_bp_2019-01-01.csv                              - moderately_sensitive
-  output/measure_systolic_bp_practice_only.csv                                         - moderately_sensitive
-  output/measure_systolic_bp_practice_only_2019-01-01.csv                              - moderately_sensitive
diff --git a/tests/fixtures/metadata/executor_api_logs/action3.log b/tests/fixtures/metadata/executor_api_logs/action3.log
deleted file mode 100644
index 9a5896b8..00000000
--- a/tests/fixtures/metadata/executor_api_logs/action3.log
+++ /dev/null
@@ -1,15 +0,0 @@
-2022-04-25T11:37:53.516753154Z 2022-04-25 11:37:53 [info     ] Generating measure for study_definition in output [cohortextractor.cohortextractor] 
-2022-04-25T11:37:53.605072735Z 2022-04-25 11:37:53 [info     ] cohortextractor-stats          [cohortextractor.log_utils] output_column_count=33 table_count=30 table_joins_count=30
-
-
-job_id: 123
-job_request_id: 5678
-docker_image_id: sha256:a7b427365c8d7f93004671ce91cf91f703ed3da74c80007c755254c651ebb9de
-exit_code: 0
-created_at: 1650883036
-completed_at: 1650883052
-local_run: True
-
-
-outputs:
-  output/measure_systolic_bp.csv                                         - moderately_sensitive
diff --git a/tests/fixtures/metadata/executor_api_logs/non-action.log b/tests/fixtures/metadata/executor_api_logs/non-action.log
deleted file mode 100644
index 9deb3173..00000000
--- a/tests/fixtures/metadata/executor_api_logs/non-action.log
+++ /dev/null
@@ -1,4 +0,0 @@
-2022-04-25T11:37:53.516753154Z 2022-04-25 11:37:53 [info     ] Generating measure for study_definition in output [cohortextractor.cohortextractor] 
-
-
-  output/measure_systolic_bp.csv                                         - moderately_sensitive
diff --git a/tests/fixtures/metadata/old_style_logs/action1.log b/tests/fixtures/metadata/old_style_logs/action1.log
deleted file mode 100644
index a94fe16a..00000000
--- a/tests/fixtures/metadata/old_style_logs/action1.log
+++ /dev/null
@@ -1,29 +0,0 @@
-2022-04-25T11:37:20.751678216Z 2022-04-25 11:37:20 [info     ] Generating cohort for study_definition in output [cohortextractor.cohortextractor] 
-2022-04-25T11:37:20.832847251Z 2022-04-25 11:37:20 [info     ] cohortextractor-stats          [cohortextractor.log_utils] output_column_count=33 table_count=30 table_joins_count=30
-2022-04-25T11:37:20.832875666Z 2022-04-25 11:37:20 [info     ] cohortextractor-stats          [cohortextractor.log_utils] variable_count=32
-2022-04-25T11:37:20.832946062Z 2022-04-25 11:37:20 [info     ] cohortextractor-stats          [cohortextractor.log_utils] variables_using_codelist_count=23
-2022-04-25T11:37:20.833012480Z 2022-04-25 11:37:20 [info     ] cohortextractor-stats          [cohortextractor.log_utils] codelist_size=50 variable_using_codelist=learning_disability
-2022-04-25T11:37:20.833090409Z 2022-04-25 11:37:20 [info     ] cohortextractor-stats          [cohortextractor.log_utils] codelist_size=79 variable_using_codelist=medication_review
-2022-04-25T11:37:20.833090412Z 2022-04-25 11:37:20 [info     ] cohortextractor-stats          [cohortextractor.log_utils] description=A description with spaces sql=SELECT * FROM 
-TABLE 
-    with 
-    line 
-    breaks
-2022-04-25T11:37:20.835160414Z 2022-04-25 11:37:20 [info     ] Setting index_date to 2021-04-01 [cohortextractor.cohortextractor] 
-2022-04-25T11:37:21.133637972Z 2022-04-25 11:37:21 [info     ] Successfully created cohort and covariates at output/input_2021-04-01.feather [cohortextractor.cohortextractor] 
-2022-04-25T11:37:21.133714541Z 2022-04-25 11:37:21 [info     ] cohortextractor-stats          [cohortextractor.cohortextractor] description=generate_cohort for study_definition at 2021-04-01 execution_time=0:00:00.282108 execution_time_secs=0.28210839499999985
-2022-04-25T11:37:28.537687970Z 2022-04-25 11:37:28 [info     ] cohortextractor-stats          [cohortextractor.cohortextractor] description=generate_cohort for study_definition (all index dates) execution_time=0:00:07.801681 execution_time_secs=7.801681266000001
-
-
-state: succeeded
-docker_image_id: sha256:a7b427365c8d7f93004671ce91cf91f703ed3da74c80007c755254c651ebb9ec
-job_id: 3e6blwbtbxgpm6ji
-job_request_id: 1234
-run_by_user: becky
-created_at: 2022-04-25T11:37:16Z
-completed_at: 2022-04-25T11:37:32Z
-
-Completed successfully
-
-outputs:
-  output/input_2019-04-01.feather  - highly_sensitive
diff --git a/tests/fixtures/metadata/old_style_logs/action2.log b/tests/fixtures/metadata/old_style_logs/action2.log
deleted file mode 100644
index 465017df..00000000
--- a/tests/fixtures/metadata/old_style_logs/action2.log
+++ /dev/null
@@ -1,35 +0,0 @@
-2022-04-25T11:37:53.516753154Z 2022-04-25 11:37:53 [info     ] Generating measure for study_definition in output [cohortextractor.cohortextractor] 
-2022-04-25T11:37:53.605072735Z 2022-04-25 11:37:53 [info     ] cohortextractor-stats          [cohortextractor.log_utils] output_column_count=33 table_count=30 table_joins_count=30
-2022-04-25T11:37:53.605144893Z 2022-04-25 11:37:53 [info     ] cohortextractor-stats          [cohortextractor.log_utils] variable_count=32
-2022-04-25T11:37:53.608226064Z 2022-04-25 11:37:53 [info     ] cohortextractor-stats          [cohortextractor.cohortextractor] measures_count=2
-2022-04-25T11:37:53.609617608Z 2022-04-25 11:37:53 [info     ] Calculating measures for output/input_2019-12-01.feather [cohortextractor.cohortextractor] 
-2022-04-25T11:37:53.609752917Z 2022-04-25 11:37:53 [info     ] Calculating systolic_bp        [cohortextractor.cohortextractor] 
-2022-04-25T11:37:53.609889694Z 2022-04-25 11:37:53 [info     ] Loading patient data from output/input_2019-12-01.feather [cohortextractor.cohortextractor] 
-2022-04-25T11:37:53.698733081Z 2022-04-25 11:37:53 [info     ] cohortextractor-stats          [cohortextractor.cohortextractor] dataframe=patient_df date=datetime.date(2019, 12, 1) measure_id=systolic_bp memory=41214
-2022-04-25T11:37:53.698892232Z 2022-04-25 11:37:53 [info     ] cohortextractor-stats          [cohortextractor.cohortextractor] date=datetime.date(2019, 12, 1) description=Load patient dataframe for measures execution_time=0:00:00.078953 execution_time_secs=0.07895265700000031 input_file=output/input_2019-12-01.feather
-2022-04-25T11:37:53.707543608Z 2022-04-25 11:37:53 [info     ] cohortextractor-stats          [cohortextractor.cohortextractor] date=datetime.date(2019, 12, 1) description=Calculate measure execution_time=0:00:00.007607 execution_time_secs=0.007606717999999901 measure_id=systolic_bp
-2022-04-25T11:37:53.708836721Z 2022-04-25 11:37:53 [info     ] cohortextractor-stats          [cohortextractor.cohortextractor] dataframe=measure_df date=datetime.date(2019, 12, 1) measure_id=systolic_bp memory=3509
-2022-04-25T11:37:53.711864471Z 2022-04-25 11:37:53 [info     ] Created measure output at output/measure_systolic_bp_2019-12-01.csv [cohortextractor.cohortextractor] 
-2022-04-25T11:37:53.711955357Z 2022-04-25 11:37:53 [info     ] Calculating systolic_bp_practice_only [cohortextractor.cohortextractor] 
-2022-04-25T11:37:53.716617612Z 2022-04-25 11:37:53 [info     ] cohortextractor-stats          [cohortextractor.cohortextractor] date=datetime.date(2019, 12, 1) description=Calculate measure execution_time=0:00:00.004357 execution_time_secs=0.004357261000000001 measure_id=systolic_bp_practice_only
-2022-04-25T11:37:53.717850127Z 2022-04-25 11:37:53 [info     ] cohortextractor-stats          [cohortextractor.cohortextractor] dataframe=measure_df date=datetime.date(2019, 12, 1) measure_id=systolic_bp_practice_only memory=1120
-2022-04-25T11:37:53.719066327Z 2022-04-25 11:37:53 [info     ] Created measure output at output/measure_systolic_bp_practice_only_2019-12-01.csv [cohortextractor.cohortextractor] 
-2022-04-25T11:37:58.169074136Z 2022-04-25 11:37:58 [info     ] Combined measure output for all dates in output/measure_systolic_bp.csv [cohortextractor.cohortextractor] 
-2022-04-25T11:37:58.171443532Z 2022-04-25 11:37:58 [info     ] Combined measure output for all dates in output/measure_systolic_bp_practice_only.csv [cohortextractor.cohortextractor] 
-
-
-state: succeeded
-docker_image_id: sha256:a7b427365c8d7f93004671ce91cf91f703ed3da74c80007c755254c651ebb9ec
-job_id: yf5dm6ekrw7cv5vo
-job_request_id: 1234
-run_by_user: becky
-created_at: 2022-04-25T11:37:16Z
-completed_at: 2022-04-25T11:37:32Z
-
-Completed successfully
-
-outputs:
-  output/measure_systolic_bp.csv                                         - moderately_sensitive
-  output/measure_systolic_bp_2019-01-01.csv                              - moderately_sensitive
-  output/measure_systolic_bp_practice_only.csv                                         - moderately_sensitive
-  output/measure_systolic_bp_practice_only_2019-01-01.csv                              - moderately_sensitive
diff --git a/tests/fixtures/metadata/old_style_logs/action3.log b/tests/fixtures/metadata/old_style_logs/action3.log
deleted file mode 100644
index 4cb946d0..00000000
--- a/tests/fixtures/metadata/old_style_logs/action3.log
+++ /dev/null
@@ -1,16 +0,0 @@
-2022-04-25T11:37:53.516753154Z 2022-04-25 11:37:53 [info     ] Generating measure for study_definition in output [cohortextractor.cohortextractor] 
-2022-04-25T11:37:53.605072735Z 2022-04-25 11:37:53 [info     ] cohortextractor-stats          [cohortextractor.log_utils] output_column_count=33 table_count=30 table_joins_count=30
-
-
-state: succeeded
-docker_image_id: sha256:a7b427365c8d7f93004671ce91cf91f703ed3da74c80007c755254c651ebb9de
-job_id: 123
-job_request_id: 5678
-run_by_user: becky
-created_at: 2022-04-25T11:37:16Z
-completed_at: 2022-04-25T11:37:32Z
-
-Completed successfully
-
-outputs:
-  output/measure_systolic_bp.csv                                         - moderately_sensitive
diff --git a/tests/fixtures/metadata/old_style_logs/non-action.log b/tests/fixtures/metadata/old_style_logs/non-action.log
deleted file mode 100644
index 9deb3173..00000000
--- a/tests/fixtures/metadata/old_style_logs/non-action.log
+++ /dev/null
@@ -1,4 +0,0 @@
-2022-04-25T11:37:53.516753154Z 2022-04-25 11:37:53 [info     ] Generating measure for study_definition in output [cohortextractor.cohortextractor] 
-
-
-  output/measure_systolic_bp.csv                                         - moderately_sensitive
diff --git a/tests/test_extract_stats.py b/tests/test_extract_stats.py
deleted file mode 100644
index 9f2d796a..00000000
--- a/tests/test_extract_stats.py
+++ /dev/null
@@ -1,96 +0,0 @@
-import json
-import shutil
-from pathlib import Path
-
-import pytest
-
-from opensafely import extract_stats
-
-
-def add_metadata_to_project_path(tmp_path, fixture_folder):
-    fixture_path = Path(__file__).parent / "fixtures" / "metadata" / fixture_folder
-    shutil.copytree(fixture_path, tmp_path / "metadata")
-    return tmp_path
-
-
-@pytest.mark.parametrize(
-    "fixture_folder,unique_summary_keys",
-    [
-        ("old_style_logs", ["state", "run_by_user"]),
-        # Jobs run via the executor api log slightly different fields and format to
-        # old-style jobs
-        ("executor_api_logs", ["exit_code", "local_run"]),
-    ],
-)
-def test_log_stats(tmp_path, fixture_folder, unique_summary_keys):
-    project_path = add_metadata_to_project_path(tmp_path, fixture_folder)
-    extract_stats.main(project_path, "stats.json")
-    stats_output_path = project_path / "metadata" / "stats.json"
-    assert stats_output_path.exists()
-
-    stats_json = [json.loads(line) for line in stats_output_path.open().readlines()]
-    # sort logs by action, with summary log at th
-    stats_json.sort(key=lambda x: (x["action"], x.get("docker_image_id", "")))
-    # action 1 and 2 are part of one job request
-    # action 1 has 8 stats logs plus one summary log
-    # action 2 has 9 stats logs plus one summary log
-    # action 3 is a different job request
-    # action 3 has 1 stats log plus one summary log
-    # non-action.log is processed, but has no stats log or summary section
-
-    assert len(stats_json) == 9 + 10 + 2
-    summary_logs = [stats_json[8], stats_json[18], stats_json[-1]]
-    action1_logs = stats_json[0:8]
-    action2_logs = stats_json[10:18]
-    action3_logs = stats_json[19:-1]
-
-    # logs from jobs run the old way and with the new executor-api have some commong
-    # keys, and a couple that are different
-    common_summary_keys = [
-        "timestamp",
-        "project",
-        "action",
-        "docker_image_id",
-        "job_id",
-        "job_request_id",
-        "created_at",
-        "completed_at",
-        "action_elapsed_time_secs",
-        "action_elapsed_time",
-        "total_actions_in_job_request",
-    ]
-    summary_keys = common_summary_keys + unique_summary_keys
-
-    for summary_log in summary_logs:
-        # First item in each action log list is the summary
-        # It has the expected keys
-        assert set(summary_log.keys()) == set(summary_keys)
-        # elapsed time is calculated from created/completed time and reported in
-        # seconds and human-readable H:M:S
-        assert summary_log["action_elapsed_time_secs"] == 16
-        assert summary_log["action_elapsed_time"] == "0:00:16"
-    for summary_log in summary_logs[:1]:
-        assert summary_log["total_actions_in_job_request"] == 2
-        assert summary_log["job_request_id"] == "1234"
-    assert summary_logs[2]["total_actions_in_job_request"] == 1
-    assert summary_logs[2]["job_request_id"] == "5678"
-
-    # all other logs have the job id, project etc added
-    def _assert_keys(log):
-        for key in ["job_id", "job_request_id", "timestamp", "project", "action"]:
-            assert key in log
-
-    for statslog in action1_logs:
-        _assert_keys(statslog)
-        assert statslog["job_id"] == "3e6blwbtbxgpm6ji"
-        assert statslog["job_request_id"] == "1234"
-
-    for statslog in action2_logs:
-        _assert_keys(statslog)
-        assert statslog["job_id"] == "yf5dm6ekrw7cv5vo"
-        assert statslog["job_request_id"] == "1234"
-
-    for statslog in action3_logs:
-        _assert_keys(statslog)
-        assert statslog["job_id"] == "123"
-        assert statslog["job_request_id"] == "5678"