From 1998ce38a95597edf410104890fc98b71027bab1 Mon Sep 17 00:00:00 2001 From: Steven Maude Date: Tue, 12 May 2026 15:59:04 +0100 Subject: [PATCH] Remove `extract-stats` command Fixes #394. Reasoning: * It's already been deprecated a while. * It was only for use with `cohort-extractor` logs, and that has now been marked as unsupported since June 2025; reference: https://github.com/opensafely-core/cohort-extractor/commit/bb89f630edc13d420934a15d6d86a360859e50ef --- opensafely/__init__.py | 2 - opensafely/extract_stats.py | 226 ------------------ .../metadata/executor_api_logs/action1.log | 27 --- .../metadata/executor_api_logs/action2.log | 33 --- .../metadata/executor_api_logs/action3.log | 15 -- .../metadata/executor_api_logs/non-action.log | 4 - .../metadata/old_style_logs/action1.log | 29 --- .../metadata/old_style_logs/action2.log | 35 --- .../metadata/old_style_logs/action3.log | 16 -- .../metadata/old_style_logs/non-action.log | 4 - tests/test_extract_stats.py | 96 -------- 11 files changed, 487 deletions(-) delete mode 100755 opensafely/extract_stats.py delete mode 100644 tests/fixtures/metadata/executor_api_logs/action1.log delete mode 100644 tests/fixtures/metadata/executor_api_logs/action2.log delete mode 100644 tests/fixtures/metadata/executor_api_logs/action3.log delete mode 100644 tests/fixtures/metadata/executor_api_logs/non-action.log delete mode 100644 tests/fixtures/metadata/old_style_logs/action1.log delete mode 100644 tests/fixtures/metadata/old_style_logs/action2.log delete mode 100644 tests/fixtures/metadata/old_style_logs/action3.log delete mode 100644 tests/fixtures/metadata/old_style_logs/non-action.log delete mode 100644 tests/test_extract_stats.py diff --git a/opensafely/__init__.py b/opensafely/__init__.py index fd6fc647..67f64bf4 100644 --- a/opensafely/__init__.py +++ b/opensafely/__init__.py @@ -11,7 +11,6 @@ clean, codelists, execute, - extract_stats, info, jupyter, launch, @@ -150,7 +149,6 @@ def add_subcommand(cmd, module): add_subcommand("pull", pull) add_subcommand("upgrade", upgrade) add_subcommand("unzip", unzip) - add_subcommand("extract-stats", extract_stats) add_subcommand("info", info) add_subcommand("exec", execute) add_subcommand("clean", clean) diff --git a/opensafely/extract_stats.py b/opensafely/extract_stats.py deleted file mode 100755 index 4307fbc0..00000000 --- a/opensafely/extract_stats.py +++ /dev/null @@ -1,226 +0,0 @@ -#!/usr/bin/python3 -import json -import re -from argparse import ArgumentParser -from collections import Counter -from datetime import datetime -from pathlib import Path - - -DESCRIPTION = "Parse and output cohortextractor-stats logs as JSONL" -TIMESTAMP_PREFIX_REGEX = re.compile(r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}.\d{9}Z") -COMPLETED_PREFIX_REGEX = re.compile(r"^Completed successfully|outputs:") -KEY_VALUE_REGEX = re.compile(r"(?<=\s)([^\s=]+)=(.*?)(?=(?:\s[^\s=]+=|$))") -ACTION_SUMMARY_REGEX = re.compile( - r"^(state|exit_code|docker_image_id|job_id|job_request_id|run_by_user|created_at|completed_at|local_run):\s(.*)" -) - - -def add_arguments(parser): - parser.add_argument( - "--project-dir", - help="Project directory (default: current directory)", - type=Path, - default=".", - ) - parser.add_argument( - "--project-name", - help="Project name (default: project-dir)", - type=str, - ) - parser.add_argument( - "--output-file", - help="Output json filename (default: extracted_stats.json)", - type=Path, - default="extracted_stats.json", - ) - return parser - - -def _timestamp_for_honeytail(ts_datetime): - honeytail_ts_format = "%Y-%m-%dT%H:%M:%S.000000000Z" - return ts_datetime.strftime(honeytail_ts_format) - - -def parse_log(project_name, current_action, job_id, job_request_id, current_log): - current_log_timestamp = TIMESTAMP_PREFIX_REGEX.match(current_log).group() - current_log = re.sub(r"\s*\n\s*", " ", current_log).strip() - log_params = dict(KEY_VALUE_REGEX.findall(current_log)) - return { - "timestamp": current_log_timestamp, - "project": project_name, - "job_id": job_id, - "job_request_id": job_request_id, - "action": current_action, - **log_params, - } - - -def parse_stats_logs(raw_logs, project_name, current_action, job_id, job_request_id): - for log in raw_logs: - yield parse_log(project_name, current_action, job_id, job_request_id, log) - - -def _parse_summary_datetime(datetime_string, ts_format): - """ - Created/completed in the log summary may be a stingified int (unix time) - or ISO-8061 UTC format - Convert the string to a datetime object and return both the datetime object and - its ISO-8061 UTC string - """ - try: - datetime_obj = datetime.fromtimestamp(int(datetime_string)) - return datetime_obj, datetime_obj.strftime(ts_format) - except ValueError: - return datetime.strptime(datetime_string, ts_format), datetime_string - - -def format_summary_stats(project_name, current_action, summary_stats): - start_time = summary_stats.get("created_at") - end_time = summary_stats.get("completed_at") - - iso_utc_format = "%Y-%m-%dT%H:%M:%SZ" - - start_dt, start_time_formatted = _parse_summary_datetime(start_time, iso_utc_format) - summary_stats["created_at"] = start_time_formatted - if start_time and end_time: - end_dt, end_time_formatted = _parse_summary_datetime(end_time, iso_utc_format) - elapsed = end_dt - start_dt - # prefix summary elapsed time to make filtering easier - summary_stats["action_elapsed_time_secs"] = elapsed.seconds - summary_stats["action_elapsed_time"] = str(elapsed) - summary_stats["completed_at"] = end_time_formatted - - return { - # add the timestamp field in the format that honeytail requires - "timestamp": _timestamp_for_honeytail(start_dt), - "project": project_name, - "action": current_action, - **{k: v for k, v in summary_stats.items()}, - } - - -def add_action_counts_by_job_request(summary_stats_list): - job_request_counts = Counter( - [ - summary.get("job_request_id") - for summary in summary_stats_list - if "job_request_id" in summary - ] - ) - # Include the total number of actions run alongside this one in each action summary - summary_stats_with_action_counts = [] - for summary in summary_stats_list: - job_request_id = summary.get("job_request_id") - action_count = job_request_counts[job_request_id] if job_request_id else 0 - summary_stats_with_action_counts.append( - {**summary, "total_actions_in_job_request": action_count} - ) - return summary_stats_with_action_counts - - -def main(project_dir, output_file, project_name=None): - """ - Read all log files from the project's metadata folder and extract stats logs. - - Cohort-extractor actions log stats with the event "cohortextractor-stats", - which is used to idenitfy stats logs, and any number of key-value pairs of - stats data in the format key=value. Logs can span multiple lines, and values in - a key=value pair can contain spaces. - - In addition, each log file contains a section with overall job summary data, which - is extracted into a single additional log, and other non-stats logs, which are ignored. - - Output is in JSONL format, one log per line. - """ - log_dir = project_dir / "metadata" - project_name = project_name or project_dir.resolve().name - - log_files = list(log_dir.glob("*.log")) - - stats_logs = [] - summary_stats_logs = [] - - for filep in log_files: - current_action = filep.stem - summary_stats = {} - raw_logs = [] - # logs in the log file can span multiple lines, and are a mixture of stats logs, - # which we want to extract, and other logs, which we mostly don't care about. - # `extracted_log` keeps track of the log that's being parsed as we interate over the log - # file line-by-line, collecting the stats logs into the `raw_logs` list - - extracted_log = None - for line in filep.open().readlines(): - # check if this line is the beginning of a new log or the beginning of the end-of-file - # action summary - if TIMESTAMP_PREFIX_REGEX.match(line) or COMPLETED_PREFIX_REGEX.match(line): - # We're at the beginning of a new log - # If we were in the process of extracting a log, we're finished with it now - # Add it to the list of raw logs to be processed later - # (extracted_log can be None if we're at the beginning of the file, or if - # the previous line wasn't a stats log) - if extracted_log is not None: - raw_logs.append(extracted_log) - # Now reset the extracted_log... - if "cohortextractor-stats" in line: - # ...to the current line if it's a stats log - extracted_log = line - else: - # ...to None if it's a standard non-stats log - extracted_log = None - elif ACTION_SUMMARY_REGEX.match(line): - # Check for the summary stats lines - summary_stats.update(dict(ACTION_SUMMARY_REGEX.findall(line))) - elif extracted_log is not None: - # this line isn't a log start, and we're in the process of extracting a log, - # so it must be a continuation line. Append it to the log we're extracting. - extracted_log += line - # Add the final extracted log, if there is one - if extracted_log is not None: - raw_logs.append(extracted_log) - - # At the end of each processed file, we parse the raw logs into a dict - # and add them to the full list that will be written to file - if raw_logs: - stats_logs.extend( - parse_stats_logs( - raw_logs, - project_name, - current_action, - summary_stats.get("job_id"), - summary_stats.get("job_request_id"), - ) - ) - - # Format and add the summary stats - # Check for job_id in case we encountered other, non-job logs in the metadata folder - if summary_stats.get("job_id"): - summary_stats_logs.append( - format_summary_stats(project_name, current_action, summary_stats) - ) - - # Now that all log files are processed, find the action acounts by - # job request - summary_stats_logs = add_action_counts_by_job_request(summary_stats_logs) - - with (log_dir / output_file).open("w") as outpath: - for log in [*summary_stats_logs, *stats_logs]: - json.dump(log, outpath) - outpath.write("\n") - print(f"Log stats written to {log_dir / output_file}") - - -def run(): - parser = ArgumentParser(description=DESCRIPTION) - parser = add_arguments(parser) - args = parser.parse_args() - main( - project_dir=args.project_dir / "metadata", - output_file=args.output_file, - project_name=args.project_name, - ) - - -if __name__ == "__main__": - run() diff --git a/tests/fixtures/metadata/executor_api_logs/action1.log b/tests/fixtures/metadata/executor_api_logs/action1.log deleted file mode 100644 index 032f79e6..00000000 --- a/tests/fixtures/metadata/executor_api_logs/action1.log +++ /dev/null @@ -1,27 +0,0 @@ -2022-04-25T11:37:20.751678216Z 2022-04-25 11:37:20 [info ] Generating cohort for study_definition in output [cohortextractor.cohortextractor] -2022-04-25T11:37:20.832847251Z 2022-04-25 11:37:20 [info ] cohortextractor-stats [cohortextractor.log_utils] output_column_count=33 table_count=30 table_joins_count=30 -2022-04-25T11:37:20.832875666Z 2022-04-25 11:37:20 [info ] cohortextractor-stats [cohortextractor.log_utils] variable_count=32 -2022-04-25T11:37:20.832946062Z 2022-04-25 11:37:20 [info ] cohortextractor-stats [cohortextractor.log_utils] variables_using_codelist_count=23 -2022-04-25T11:37:20.833012480Z 2022-04-25 11:37:20 [info ] cohortextractor-stats [cohortextractor.log_utils] codelist_size=50 variable_using_codelist=learning_disability -2022-04-25T11:37:20.833090409Z 2022-04-25 11:37:20 [info ] cohortextractor-stats [cohortextractor.log_utils] codelist_size=79 variable_using_codelist=medication_review -2022-04-25T11:37:20.833090412Z 2022-04-25 11:37:20 [info ] cohortextractor-stats [cohortextractor.log_utils] description=A description with spaces sql=SELECT * FROM -TABLE - with - line - breaks -2022-04-25T11:37:20.835160414Z 2022-04-25 11:37:20 [info ] Setting index_date to 2021-04-01 [cohortextractor.cohortextractor] -2022-04-25T11:37:21.133637972Z 2022-04-25 11:37:21 [info ] Successfully created cohort and covariates at output/input_2021-04-01.feather [cohortextractor.cohortextractor] -2022-04-25T11:37:21.133714541Z 2022-04-25 11:37:21 [info ] cohortextractor-stats [cohortextractor.cohortextractor] description=generate_cohort for study_definition at 2021-04-01 execution_time=0:00:00.282108 execution_time_secs=0.28210839499999985 -2022-04-25T11:37:28.537687970Z 2022-04-25 11:37:28 [info ] cohortextractor-stats [cohortextractor.cohortextractor] description=generate_cohort for study_definition (all index dates) execution_time=0:00:07.801681 execution_time_secs=7.801681266000001 - - -job_id: 3e6blwbtbxgpm6ji -job_request_id: 1234 -docker_image_id: sha256:a7b427365c8d7f93004671ce91cf91f703ed3da74c80007c755254c651ebb9ec -exit_code: 0 -created_at: 1650883036 -completed_at: 1650883052 -local_run: True - -outputs: - output/input_2019-04-01.feather - highly_sensitive diff --git a/tests/fixtures/metadata/executor_api_logs/action2.log b/tests/fixtures/metadata/executor_api_logs/action2.log deleted file mode 100644 index 730bfbce..00000000 --- a/tests/fixtures/metadata/executor_api_logs/action2.log +++ /dev/null @@ -1,33 +0,0 @@ -2022-04-25T11:37:53.516753154Z 2022-04-25 11:37:53 [info ] Generating measure for study_definition in output [cohortextractor.cohortextractor] -2022-04-25T11:37:53.605072735Z 2022-04-25 11:37:53 [info ] cohortextractor-stats [cohortextractor.log_utils] output_column_count=33 table_count=30 table_joins_count=30 -2022-04-25T11:37:53.605144893Z 2022-04-25 11:37:53 [info ] cohortextractor-stats [cohortextractor.log_utils] variable_count=32 -2022-04-25T11:37:53.608226064Z 2022-04-25 11:37:53 [info ] cohortextractor-stats [cohortextractor.cohortextractor] measures_count=2 -2022-04-25T11:37:53.609617608Z 2022-04-25 11:37:53 [info ] Calculating measures for output/input_2019-12-01.feather [cohortextractor.cohortextractor] -2022-04-25T11:37:53.609752917Z 2022-04-25 11:37:53 [info ] Calculating systolic_bp [cohortextractor.cohortextractor] -2022-04-25T11:37:53.609889694Z 2022-04-25 11:37:53 [info ] Loading patient data from output/input_2019-12-01.feather [cohortextractor.cohortextractor] -2022-04-25T11:37:53.698733081Z 2022-04-25 11:37:53 [info ] cohortextractor-stats [cohortextractor.cohortextractor] dataframe=patient_df date=datetime.date(2019, 12, 1) measure_id=systolic_bp memory=41214 -2022-04-25T11:37:53.698892232Z 2022-04-25 11:37:53 [info ] cohortextractor-stats [cohortextractor.cohortextractor] date=datetime.date(2019, 12, 1) description=Load patient dataframe for measures execution_time=0:00:00.078953 execution_time_secs=0.07895265700000031 input_file=output/input_2019-12-01.feather -2022-04-25T11:37:53.707543608Z 2022-04-25 11:37:53 [info ] cohortextractor-stats [cohortextractor.cohortextractor] date=datetime.date(2019, 12, 1) description=Calculate measure execution_time=0:00:00.007607 execution_time_secs=0.007606717999999901 measure_id=systolic_bp -2022-04-25T11:37:53.708836721Z 2022-04-25 11:37:53 [info ] cohortextractor-stats [cohortextractor.cohortextractor] dataframe=measure_df date=datetime.date(2019, 12, 1) measure_id=systolic_bp memory=3509 -2022-04-25T11:37:53.711864471Z 2022-04-25 11:37:53 [info ] Created measure output at output/measure_systolic_bp_2019-12-01.csv [cohortextractor.cohortextractor] -2022-04-25T11:37:53.711955357Z 2022-04-25 11:37:53 [info ] Calculating systolic_bp_practice_only [cohortextractor.cohortextractor] -2022-04-25T11:37:53.716617612Z 2022-04-25 11:37:53 [info ] cohortextractor-stats [cohortextractor.cohortextractor] date=datetime.date(2019, 12, 1) description=Calculate measure execution_time=0:00:00.004357 execution_time_secs=0.004357261000000001 measure_id=systolic_bp_practice_only -2022-04-25T11:37:53.717850127Z 2022-04-25 11:37:53 [info ] cohortextractor-stats [cohortextractor.cohortextractor] dataframe=measure_df date=datetime.date(2019, 12, 1) measure_id=systolic_bp_practice_only memory=1120 -2022-04-25T11:37:53.719066327Z 2022-04-25 11:37:53 [info ] Created measure output at output/measure_systolic_bp_practice_only_2019-12-01.csv [cohortextractor.cohortextractor] -2022-04-25T11:37:58.169074136Z 2022-04-25 11:37:58 [info ] Combined measure output for all dates in output/measure_systolic_bp.csv [cohortextractor.cohortextractor] -2022-04-25T11:37:58.171443532Z 2022-04-25 11:37:58 [info ] Combined measure output for all dates in output/measure_systolic_bp_practice_only.csv [cohortextractor.cohortextractor] - - -job_id: yf5dm6ekrw7cv5vo -job_request_id: 1234 -docker_image_id: sha256:a7b427365c8d7f93004671ce91cf91f703ed3da74c80007c755254c651ebb9ec -exit_code: 0 -created_at: 1650883036 -completed_at: 1650883052 -local_run: True - -outputs: - output/measure_systolic_bp.csv - moderately_sensitive - output/measure_systolic_bp_2019-01-01.csv - moderately_sensitive - output/measure_systolic_bp_practice_only.csv - moderately_sensitive - output/measure_systolic_bp_practice_only_2019-01-01.csv - moderately_sensitive diff --git a/tests/fixtures/metadata/executor_api_logs/action3.log b/tests/fixtures/metadata/executor_api_logs/action3.log deleted file mode 100644 index 9a5896b8..00000000 --- a/tests/fixtures/metadata/executor_api_logs/action3.log +++ /dev/null @@ -1,15 +0,0 @@ -2022-04-25T11:37:53.516753154Z 2022-04-25 11:37:53 [info ] Generating measure for study_definition in output [cohortextractor.cohortextractor] -2022-04-25T11:37:53.605072735Z 2022-04-25 11:37:53 [info ] cohortextractor-stats [cohortextractor.log_utils] output_column_count=33 table_count=30 table_joins_count=30 - - -job_id: 123 -job_request_id: 5678 -docker_image_id: sha256:a7b427365c8d7f93004671ce91cf91f703ed3da74c80007c755254c651ebb9de -exit_code: 0 -created_at: 1650883036 -completed_at: 1650883052 -local_run: True - - -outputs: - output/measure_systolic_bp.csv - moderately_sensitive diff --git a/tests/fixtures/metadata/executor_api_logs/non-action.log b/tests/fixtures/metadata/executor_api_logs/non-action.log deleted file mode 100644 index 9deb3173..00000000 --- a/tests/fixtures/metadata/executor_api_logs/non-action.log +++ /dev/null @@ -1,4 +0,0 @@ -2022-04-25T11:37:53.516753154Z 2022-04-25 11:37:53 [info ] Generating measure for study_definition in output [cohortextractor.cohortextractor] - - - output/measure_systolic_bp.csv - moderately_sensitive diff --git a/tests/fixtures/metadata/old_style_logs/action1.log b/tests/fixtures/metadata/old_style_logs/action1.log deleted file mode 100644 index a94fe16a..00000000 --- a/tests/fixtures/metadata/old_style_logs/action1.log +++ /dev/null @@ -1,29 +0,0 @@ -2022-04-25T11:37:20.751678216Z 2022-04-25 11:37:20 [info ] Generating cohort for study_definition in output [cohortextractor.cohortextractor] -2022-04-25T11:37:20.832847251Z 2022-04-25 11:37:20 [info ] cohortextractor-stats [cohortextractor.log_utils] output_column_count=33 table_count=30 table_joins_count=30 -2022-04-25T11:37:20.832875666Z 2022-04-25 11:37:20 [info ] cohortextractor-stats [cohortextractor.log_utils] variable_count=32 -2022-04-25T11:37:20.832946062Z 2022-04-25 11:37:20 [info ] cohortextractor-stats [cohortextractor.log_utils] variables_using_codelist_count=23 -2022-04-25T11:37:20.833012480Z 2022-04-25 11:37:20 [info ] cohortextractor-stats [cohortextractor.log_utils] codelist_size=50 variable_using_codelist=learning_disability -2022-04-25T11:37:20.833090409Z 2022-04-25 11:37:20 [info ] cohortextractor-stats [cohortextractor.log_utils] codelist_size=79 variable_using_codelist=medication_review -2022-04-25T11:37:20.833090412Z 2022-04-25 11:37:20 [info ] cohortextractor-stats [cohortextractor.log_utils] description=A description with spaces sql=SELECT * FROM -TABLE - with - line - breaks -2022-04-25T11:37:20.835160414Z 2022-04-25 11:37:20 [info ] Setting index_date to 2021-04-01 [cohortextractor.cohortextractor] -2022-04-25T11:37:21.133637972Z 2022-04-25 11:37:21 [info ] Successfully created cohort and covariates at output/input_2021-04-01.feather [cohortextractor.cohortextractor] -2022-04-25T11:37:21.133714541Z 2022-04-25 11:37:21 [info ] cohortextractor-stats [cohortextractor.cohortextractor] description=generate_cohort for study_definition at 2021-04-01 execution_time=0:00:00.282108 execution_time_secs=0.28210839499999985 -2022-04-25T11:37:28.537687970Z 2022-04-25 11:37:28 [info ] cohortextractor-stats [cohortextractor.cohortextractor] description=generate_cohort for study_definition (all index dates) execution_time=0:00:07.801681 execution_time_secs=7.801681266000001 - - -state: succeeded -docker_image_id: sha256:a7b427365c8d7f93004671ce91cf91f703ed3da74c80007c755254c651ebb9ec -job_id: 3e6blwbtbxgpm6ji -job_request_id: 1234 -run_by_user: becky -created_at: 2022-04-25T11:37:16Z -completed_at: 2022-04-25T11:37:32Z - -Completed successfully - -outputs: - output/input_2019-04-01.feather - highly_sensitive diff --git a/tests/fixtures/metadata/old_style_logs/action2.log b/tests/fixtures/metadata/old_style_logs/action2.log deleted file mode 100644 index 465017df..00000000 --- a/tests/fixtures/metadata/old_style_logs/action2.log +++ /dev/null @@ -1,35 +0,0 @@ -2022-04-25T11:37:53.516753154Z 2022-04-25 11:37:53 [info ] Generating measure for study_definition in output [cohortextractor.cohortextractor] -2022-04-25T11:37:53.605072735Z 2022-04-25 11:37:53 [info ] cohortextractor-stats [cohortextractor.log_utils] output_column_count=33 table_count=30 table_joins_count=30 -2022-04-25T11:37:53.605144893Z 2022-04-25 11:37:53 [info ] cohortextractor-stats [cohortextractor.log_utils] variable_count=32 -2022-04-25T11:37:53.608226064Z 2022-04-25 11:37:53 [info ] cohortextractor-stats [cohortextractor.cohortextractor] measures_count=2 -2022-04-25T11:37:53.609617608Z 2022-04-25 11:37:53 [info ] Calculating measures for output/input_2019-12-01.feather [cohortextractor.cohortextractor] -2022-04-25T11:37:53.609752917Z 2022-04-25 11:37:53 [info ] Calculating systolic_bp [cohortextractor.cohortextractor] -2022-04-25T11:37:53.609889694Z 2022-04-25 11:37:53 [info ] Loading patient data from output/input_2019-12-01.feather [cohortextractor.cohortextractor] -2022-04-25T11:37:53.698733081Z 2022-04-25 11:37:53 [info ] cohortextractor-stats [cohortextractor.cohortextractor] dataframe=patient_df date=datetime.date(2019, 12, 1) measure_id=systolic_bp memory=41214 -2022-04-25T11:37:53.698892232Z 2022-04-25 11:37:53 [info ] cohortextractor-stats [cohortextractor.cohortextractor] date=datetime.date(2019, 12, 1) description=Load patient dataframe for measures execution_time=0:00:00.078953 execution_time_secs=0.07895265700000031 input_file=output/input_2019-12-01.feather -2022-04-25T11:37:53.707543608Z 2022-04-25 11:37:53 [info ] cohortextractor-stats [cohortextractor.cohortextractor] date=datetime.date(2019, 12, 1) description=Calculate measure execution_time=0:00:00.007607 execution_time_secs=0.007606717999999901 measure_id=systolic_bp -2022-04-25T11:37:53.708836721Z 2022-04-25 11:37:53 [info ] cohortextractor-stats [cohortextractor.cohortextractor] dataframe=measure_df date=datetime.date(2019, 12, 1) measure_id=systolic_bp memory=3509 -2022-04-25T11:37:53.711864471Z 2022-04-25 11:37:53 [info ] Created measure output at output/measure_systolic_bp_2019-12-01.csv [cohortextractor.cohortextractor] -2022-04-25T11:37:53.711955357Z 2022-04-25 11:37:53 [info ] Calculating systolic_bp_practice_only [cohortextractor.cohortextractor] -2022-04-25T11:37:53.716617612Z 2022-04-25 11:37:53 [info ] cohortextractor-stats [cohortextractor.cohortextractor] date=datetime.date(2019, 12, 1) description=Calculate measure execution_time=0:00:00.004357 execution_time_secs=0.004357261000000001 measure_id=systolic_bp_practice_only -2022-04-25T11:37:53.717850127Z 2022-04-25 11:37:53 [info ] cohortextractor-stats [cohortextractor.cohortextractor] dataframe=measure_df date=datetime.date(2019, 12, 1) measure_id=systolic_bp_practice_only memory=1120 -2022-04-25T11:37:53.719066327Z 2022-04-25 11:37:53 [info ] Created measure output at output/measure_systolic_bp_practice_only_2019-12-01.csv [cohortextractor.cohortextractor] -2022-04-25T11:37:58.169074136Z 2022-04-25 11:37:58 [info ] Combined measure output for all dates in output/measure_systolic_bp.csv [cohortextractor.cohortextractor] -2022-04-25T11:37:58.171443532Z 2022-04-25 11:37:58 [info ] Combined measure output for all dates in output/measure_systolic_bp_practice_only.csv [cohortextractor.cohortextractor] - - -state: succeeded -docker_image_id: sha256:a7b427365c8d7f93004671ce91cf91f703ed3da74c80007c755254c651ebb9ec -job_id: yf5dm6ekrw7cv5vo -job_request_id: 1234 -run_by_user: becky -created_at: 2022-04-25T11:37:16Z -completed_at: 2022-04-25T11:37:32Z - -Completed successfully - -outputs: - output/measure_systolic_bp.csv - moderately_sensitive - output/measure_systolic_bp_2019-01-01.csv - moderately_sensitive - output/measure_systolic_bp_practice_only.csv - moderately_sensitive - output/measure_systolic_bp_practice_only_2019-01-01.csv - moderately_sensitive diff --git a/tests/fixtures/metadata/old_style_logs/action3.log b/tests/fixtures/metadata/old_style_logs/action3.log deleted file mode 100644 index 4cb946d0..00000000 --- a/tests/fixtures/metadata/old_style_logs/action3.log +++ /dev/null @@ -1,16 +0,0 @@ -2022-04-25T11:37:53.516753154Z 2022-04-25 11:37:53 [info ] Generating measure for study_definition in output [cohortextractor.cohortextractor] -2022-04-25T11:37:53.605072735Z 2022-04-25 11:37:53 [info ] cohortextractor-stats [cohortextractor.log_utils] output_column_count=33 table_count=30 table_joins_count=30 - - -state: succeeded -docker_image_id: sha256:a7b427365c8d7f93004671ce91cf91f703ed3da74c80007c755254c651ebb9de -job_id: 123 -job_request_id: 5678 -run_by_user: becky -created_at: 2022-04-25T11:37:16Z -completed_at: 2022-04-25T11:37:32Z - -Completed successfully - -outputs: - output/measure_systolic_bp.csv - moderately_sensitive diff --git a/tests/fixtures/metadata/old_style_logs/non-action.log b/tests/fixtures/metadata/old_style_logs/non-action.log deleted file mode 100644 index 9deb3173..00000000 --- a/tests/fixtures/metadata/old_style_logs/non-action.log +++ /dev/null @@ -1,4 +0,0 @@ -2022-04-25T11:37:53.516753154Z 2022-04-25 11:37:53 [info ] Generating measure for study_definition in output [cohortextractor.cohortextractor] - - - output/measure_systolic_bp.csv - moderately_sensitive diff --git a/tests/test_extract_stats.py b/tests/test_extract_stats.py deleted file mode 100644 index 9f2d796a..00000000 --- a/tests/test_extract_stats.py +++ /dev/null @@ -1,96 +0,0 @@ -import json -import shutil -from pathlib import Path - -import pytest - -from opensafely import extract_stats - - -def add_metadata_to_project_path(tmp_path, fixture_folder): - fixture_path = Path(__file__).parent / "fixtures" / "metadata" / fixture_folder - shutil.copytree(fixture_path, tmp_path / "metadata") - return tmp_path - - -@pytest.mark.parametrize( - "fixture_folder,unique_summary_keys", - [ - ("old_style_logs", ["state", "run_by_user"]), - # Jobs run via the executor api log slightly different fields and format to - # old-style jobs - ("executor_api_logs", ["exit_code", "local_run"]), - ], -) -def test_log_stats(tmp_path, fixture_folder, unique_summary_keys): - project_path = add_metadata_to_project_path(tmp_path, fixture_folder) - extract_stats.main(project_path, "stats.json") - stats_output_path = project_path / "metadata" / "stats.json" - assert stats_output_path.exists() - - stats_json = [json.loads(line) for line in stats_output_path.open().readlines()] - # sort logs by action, with summary log at th - stats_json.sort(key=lambda x: (x["action"], x.get("docker_image_id", ""))) - # action 1 and 2 are part of one job request - # action 1 has 8 stats logs plus one summary log - # action 2 has 9 stats logs plus one summary log - # action 3 is a different job request - # action 3 has 1 stats log plus one summary log - # non-action.log is processed, but has no stats log or summary section - - assert len(stats_json) == 9 + 10 + 2 - summary_logs = [stats_json[8], stats_json[18], stats_json[-1]] - action1_logs = stats_json[0:8] - action2_logs = stats_json[10:18] - action3_logs = stats_json[19:-1] - - # logs from jobs run the old way and with the new executor-api have some commong - # keys, and a couple that are different - common_summary_keys = [ - "timestamp", - "project", - "action", - "docker_image_id", - "job_id", - "job_request_id", - "created_at", - "completed_at", - "action_elapsed_time_secs", - "action_elapsed_time", - "total_actions_in_job_request", - ] - summary_keys = common_summary_keys + unique_summary_keys - - for summary_log in summary_logs: - # First item in each action log list is the summary - # It has the expected keys - assert set(summary_log.keys()) == set(summary_keys) - # elapsed time is calculated from created/completed time and reported in - # seconds and human-readable H:M:S - assert summary_log["action_elapsed_time_secs"] == 16 - assert summary_log["action_elapsed_time"] == "0:00:16" - for summary_log in summary_logs[:1]: - assert summary_log["total_actions_in_job_request"] == 2 - assert summary_log["job_request_id"] == "1234" - assert summary_logs[2]["total_actions_in_job_request"] == 1 - assert summary_logs[2]["job_request_id"] == "5678" - - # all other logs have the job id, project etc added - def _assert_keys(log): - for key in ["job_id", "job_request_id", "timestamp", "project", "action"]: - assert key in log - - for statslog in action1_logs: - _assert_keys(statslog) - assert statslog["job_id"] == "3e6blwbtbxgpm6ji" - assert statslog["job_request_id"] == "1234" - - for statslog in action2_logs: - _assert_keys(statslog) - assert statslog["job_id"] == "yf5dm6ekrw7cv5vo" - assert statslog["job_request_id"] == "1234" - - for statslog in action3_logs: - _assert_keys(statslog) - assert statslog["job_id"] == "123" - assert statslog["job_request_id"] == "5678"