diff --git a/dvc.lock b/dvc.lock index c1ab873..a94607e 100644 --- a/dvc.lock +++ b/dvc.lock @@ -54,7 +54,7 @@ stages: md5: f021afa12f9d7c893412a0b2980ab187.dir size: 104535557 nfiles: 13 - filter_sql: + filter_wikipedia_sql: cmd: filter_wikipedia_sql data/Wiki/wp_SQL/enwiki-page.sql.gz data/filtered/wikidata_titles.txt -o data/filtered/OneZoom_enwiki-latest-page.sql @@ -83,12 +83,13 @@ stages: md5: 87ff995e9d5028efc185857f34448746.dir size: 587064765 nfiles: 3 - add_ott_numbers: + add_ott_numbers_to_trees: cmd: - rm -rf data/OZTreeBuild/AllLife/BespokeTree/include_OT_v16.1 - mkdir -p data/OZTreeBuild/AllLife/BespokeTree/include_OT_v16.1 - add_ott_numbers_to_trees --savein - data/OZTreeBuild/AllLife/BespokeTree/include_OT_v16.1 + data/OZTreeBuild/AllLife/BespokeTree/include_OT_v16.1 --output_info + data/add_ott_numbers_to_trees.log data/OZTreeBuild/AllLife/BespokeTree/include_noAutoOTT/*.[pP][hH][yY] deps: - path: data/OZTreeBuild/AllLife/BespokeTree/include_noAutoOTT/ @@ -103,23 +104,20 @@ stages: outs: - path: data/OZTreeBuild/AllLife/BespokeTree/include_OT_v16.1/ hash: md5 - md5: 3981ca3104e6e42846f0dca4dae932d4.dir - size: 1805911 - nfiles: 56 - prepare_open_trees: + md5: cfe57e6fbd3572028ac2d83203a96fe4.dir + size: 1534894 + nfiles: 55 + get_open_trees_from_one_zoom: cmd: - - mkdir -p data/OZTreeBuild/AllLife/OpenTreeParts/OpenTree_all - - cp -n data/OZTreeBuild/AllLife/OpenTreeParts/OT_required/*.nwk - data/OZTreeBuild/AllLife/OpenTreeParts/OpenTree_all/ 2>/dev/null || true - cd data/OZTreeBuild/AllLife && get_open_trees_from_one_zoom ../../OpenTree/v16.1/draftversion.tre OpenTreeParts/OpenTree_all/ BespokeTree/include_OT_v16.1/*.PHY deps: - path: data/OZTreeBuild/AllLife/BespokeTree/include_OT_v16.1/ hash: md5 - md5: 3981ca3104e6e42846f0dca4dae932d4.dir - size: 1805911 - nfiles: 56 + md5: cfe57e6fbd3572028ac2d83203a96fe4.dir + size: 1534894 + nfiles: 55 - path: data/OZTreeBuild/AllLife/OpenTreeParts/OT_required/ hash: md5 md5: 81be05fde561126fb58b7bb7e8a0fbcd.dir @@ -136,9 +134,9 @@ stages: outs: - path: data/OZTreeBuild/AllLife/OpenTreeParts/OpenTree_all/ hash: md5 - md5: 3f8df65320201c2db0ee35b17916c7cb.dir - size: 81928129 - nfiles: 229 + md5: ed1333f8ed0fc9f2ca4315ce9cb3cc42.dir + size: 81927748 + nfiles: 227 download_eol: cmd: - curl -L -D data/EOL/.headers -o data/EOL/provider_ids.csv.gz @@ -174,7 +172,7 @@ stages: hash: md5 md5: f7c9bb8374957c07168bec36d6591347 size: 221682224 - build_tree: + build_oz_tree: cmd: - cd data/OZTreeBuild/AllLife && build_oz_tree BespokeTree/include_OT_v16.1/Base.PHY OpenTreeParts/OpenTree_all/ @@ -182,14 +180,14 @@ stages: deps: - path: data/OZTreeBuild/AllLife/BespokeTree/include_OT_v16.1/ hash: md5 - md5: 3981ca3104e6e42846f0dca4dae932d4.dir - size: 1805911 - nfiles: 56 + md5: cfe57e6fbd3572028ac2d83203a96fe4.dir + size: 1534894 + nfiles: 55 - path: data/OZTreeBuild/AllLife/OpenTreeParts/OpenTree_all/ hash: md5 - md5: 3f8df65320201c2db0ee35b17916c7cb.dir - size: 81928129 - nfiles: 229 + md5: ed1333f8ed0fc9f2ca4315ce9cb3cc42.dir + size: 81927748 + nfiles: 227 params: params.yaml: ot_version: v16.1 @@ -199,7 +197,7 @@ stages: hash: md5 md5: 0b17680b0a0a633f8ae50e4a8f68f17a size: 83061022 - create_tables: + CSV_base_table_creator: cmd: - mkdir -p data/output_files - CSV_base_table_creator data/OZTreeBuild/AllLife/AllLife_full_tree.phy @@ -210,7 +208,7 @@ stages: --version 28017344 --exclude Archosauria_ott335588 Dinosauria_ott90215 --extra_source_file data/OZTreeBuild/AllLife/BespokeTree/SupplementaryTaxonomy.tsv 2> - data/output_files/ordered_output.log + data/CSV_base_table_creator.log deps: - path: data/OZTreeBuild/AllLife/AllLife_full_tree.phy hash: md5 @@ -251,26 +249,26 @@ stages: outs: - path: data/output_files/ hash: md5 - md5: 80b98cbf1167374064c0fdf6f4cec926.dir - size: 1182677604 - nfiles: 8 - make_js: + md5: 81dfa9c3747bbe94c57161cc8e22d5c9.dir + size: 1158499655 + nfiles: 7 + make_js_treefiles: cmd: - mkdir -p data/js_output - make_js_treefiles --outdir data/js_output deps: - path: data/output_files/ hash: md5 - md5: 80b98cbf1167374064c0fdf6f4cec926.dir - size: 1182677604 - nfiles: 8 + md5: 81dfa9c3747bbe94c57161cc8e22d5c9.dir + size: 1158499655 + nfiles: 7 outs: - path: data/js_output/ hash: md5 - md5: b6808a7ee61a8b566f757ed5dca39a46.dir - size: 8293094 + md5: 9782411b79a64d6da09afacae2f3f047.dir + size: 8297912 nfiles: 6 - discover_wikidata_url: + discover_latest_wikidata_dump_url: cmd: discover_latest_wikidata_dump_url > data/Wiki/wd_JSON/latest-all-json-bz2-url.txt outs: @@ -278,7 +276,7 @@ stages: hash: md5 md5: e094b0f57c0c14e1016842c2dac5482e size: 90 - discover_enwiki_sql_url: + discover_latest_enwiki_sql_url: cmd: discover_latest_enwiki_sql_url > data/Wiki/wp_SQL/enwiki-page-sql-gz-url.txt outs: diff --git a/dvc.yaml b/dvc.yaml index fcccbfe..6e2a357 100644 --- a/dvc.yaml +++ b/dvc.yaml @@ -10,13 +10,14 @@ stages: - data/OpenTree/${ot_version}/ # ~20 secs - add_ott_numbers: + add_ott_numbers_to_trees: cmd: - rm -rf data/OZTreeBuild/${oz_tree}/BespokeTree/include_OT_${ot_version} - mkdir -p data/OZTreeBuild/${oz_tree}/BespokeTree/include_OT_${ot_version} - >- add_ott_numbers_to_trees --savein data/OZTreeBuild/${oz_tree}/BespokeTree/include_OT_${ot_version} + --output_info data/add_ott_numbers_to_trees.log data/OZTreeBuild/${oz_tree}/BespokeTree/include_noAutoOTT/*.[pP][hH][yY] deps: - data/OZTreeBuild/${oz_tree}/BespokeTree/include_noAutoOTT/ @@ -27,10 +28,8 @@ stages: - data/OZTreeBuild/${oz_tree}/BespokeTree/include_OT_${ot_version}/ # ~a few secs - prepare_open_trees: + get_open_trees_from_one_zoom: cmd: - - mkdir -p data/OZTreeBuild/${oz_tree}/OpenTreeParts/OpenTree_all - - cp -n data/OZTreeBuild/${oz_tree}/OpenTreeParts/OT_required/*.nwk data/OZTreeBuild/${oz_tree}/OpenTreeParts/OpenTree_all/ 2>/dev/null || true - >- cd data/OZTreeBuild/${oz_tree} && get_open_trees_from_one_zoom @@ -47,7 +46,7 @@ stages: outs: - data/OZTreeBuild/${oz_tree}/OpenTreeParts/OpenTree_all/ - build_tree: + build_oz_tree: cmd: - >- cd data/OZTreeBuild/${oz_tree} && @@ -92,7 +91,7 @@ stages: outs: - data/filtered/OneZoom_provider_ids.csv - discover_wikidata_url: + discover_latest_wikidata_dump_url: cmd: >- discover_latest_wikidata_dump_url > data/Wiki/wd_JSON/latest-all-json-bz2-url.txt outs: @@ -119,7 +118,7 @@ stages: outs: - data/filtered/wikidata_titles.txt - discover_enwiki_sql_url: + discover_latest_enwiki_sql_url: cmd: >- discover_latest_enwiki_sql_url > data/Wiki/wp_SQL/enwiki-page-sql-gz-url.txt outs: @@ -134,7 +133,7 @@ stages: outs: - data/Wiki/wp_SQL/enwiki-page.sql.gz - filter_sql: + filter_wikipedia_sql: cmd: >- filter_wikipedia_sql data/Wiki/wp_SQL/enwiki-page.sql.gz @@ -160,7 +159,7 @@ stages: persist: true # ~10 mins - create_tables: + CSV_base_table_creator: cmd: - mkdir -p data/output_files - >- @@ -175,7 +174,7 @@ stages: --version ${build_version} --exclude ${exclude_from_popularity} --extra_source_file data/OZTreeBuild/${oz_tree}/BespokeTree/SupplementaryTaxonomy.tsv - 2> data/output_files/ordered_output.log + 2> data/CSV_base_table_creator.log deps: - data/OZTreeBuild/${oz_tree}/${oz_tree}_full_tree.phy - data/OpenTree/${ot_version}/taxonomy.tsv @@ -192,7 +191,7 @@ stages: outs: - data/output_files/ - make_js: + make_js_treefiles: cmd: - mkdir -p data/js_output - make_js_treefiles --outdir data/js_output diff --git a/oz_tree_build/README.markdown b/oz_tree_build/README.markdown index 43eb77e..4f00d8d 100755 --- a/oz_tree_build/README.markdown +++ b/oz_tree_build/README.markdown @@ -45,168 +45,6 @@ Then see the section titled "Upload data to the server and check it" below. Edit `params.yaml` to change the OpenTree version, taxonomy version, build version, etc. DVC will detect the parameter changes and re-run only the affected stages. -## Manual steps (without DVC) - -The following manual instructions are preserved for reference. They document the same steps that the DVC pipeline automates. - -### Environment - -The following environment variables should be set: - -``` -OZ_TREE=AllLife # a tree directory in data/OZTreeBuild -OZ_DIR=../OZtree # the path to the OneZoom/OZtree github directory -``` - -You also need to select the OpenTree version to build against. -You can discover the most recent version of both the synthetic tree (`synth_id`) and the taxonomy (`taxonomy_version`) via the -[API](https://github.com/OpenTreeOfLife/germinator/wiki/Open-Tree-of-Life-Web-APIs): - -```bash -$ curl -s -X POST https://api.opentreeoflife.org/v3/tree_of_life/about | grep -E '"synth_id"|"taxonomy_version"' - "synth_id": "opentree15.1", - "taxonomy_version": "3.7draft2" -``` - -You should then set these as environment variables: - -``` -OT_VERSION=15.1 #or whatever your OpenTree version is -OT_TAXONOMY_VERSION=3.7 -OT_TAXONOMY_EXTRA=draft2 #optional - the draft for this version, e.g. `draft1` if the taxonomy_version is 3.6draft1 -``` - -### Downloads - -Follow the [the download instructions](../data/README.markdown) to fetch required files. In summary, this should entail: - -``` -## Open Tree of Life -wget -cP data/OpenTree/ "https://files.opentreeoflife.org/synthesis/opentree${OT_VERSION}/output/labelled_supertree/labelled_supertree_simplified_ottnames.tre" -wget -cP data/OpenTree/ "https://files.opentreeoflife.org/ott/ott${OT_TAXONOMY_VERSION}/ott${OT_TAXONOMY_VERSION}.tgz" - -## Wikimedia -wget -cP data/Wiki/wp_SQL/ https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-page.sql.gz -wget -cP data/Wiki/wd_JSON/ https://dumps.wikimedia.org/wikidatawiki/entities/latest-all.json.bz2 - -## Pageviews -wget -cP data/Wiki/wp_pagecounts/ 'https://dumps.wikimedia.org/other/pageview_complete/monthly/2024/2024-03/pageviews-202403-user.bz2' - -## EoL -wget -cP data/EOL/ https://eol.org/data/provider_ids.csv.gz -``` - -Note that as documented in that readme, -you will also need to create a `draftversionXXX.tre` file containing no `mrca` strings: - -``` -perl -pe 's/\)mrcaott\d+ott\d+/\)/g; s/[ _]+/_/g;' \ - data/OpenTree/labelled_supertree_simplified_ottnames.tre \ - > data/OpenTree/draftversion${OT_VERSION}.tre -``` - -### Building a tree - -The times given at the start of each of the following steps refer to the time taken to run the commands on the entire tree of life. - -If you already have your own newick tree with open tree ids on it already, and don't want to graft extra clades from the OpenTree, you can skip steps 1-4, and simply save the tree as `${OZ_TREE}_full_tree.phy` in your base directory. If you have a tree but it does not have ott numbers, then you can add them using step 1, and move the resulting tree in `BespokeTree/include_files` to `${OZ_TREE}_full_tree.phy` in your base directory. - -### Create the tree - -0. The following steps assume the venv has been activated: - - ``` - . .venv/bin/activate - ``` - - If not created, see installation steps in the [main README](../README.markdown). - -1. (20 secs) Use the [OpenTree API](https://github.com/OpenTreeOfLife/germinator/wiki/Synthetic-tree-API-v3) to add OTT ids to any non-opentree taxa in our own bespoke phylogenies (those in `*.phy` or `*.PHY` files). The new `.phy` and `.PHY` files will be created in a new directory within `data/OZTreeBuild/${OZ_TREE}/BespokeTree`, and a symlink to that directory will be created called `include_files` - - ``` - mkdir -p "data/OZTreeBuild/${OZ_TREE}/BespokeTree/include_OTT${OT_TAXONOMY_VERSION}${OT_TAXONOMY_EXTRA}" - touch "data/OZTreeBuild/${OZ_TREE}/BespokeTree/include_OTT${OT_TAXONOMY_VERSION}${OT_TAXONOMY_EXTRA}/dir" - rm data/OZTreeBuild/${OZ_TREE}/BespokeTree/include_OTT${OT_TAXONOMY_VERSION}${OT_TAXONOMY_EXTRA}/* && \ - add_ott_numbers_to_trees \ - --savein data/OZTreeBuild/${OZ_TREE}/BespokeTree/include_OTT${OT_TAXONOMY_VERSION}${OT_TAXONOMY_EXTRA} \ - data/OZTreeBuild/${OZ_TREE}/BespokeTree/include_noAutoOTT/*.[pP][hH][yY] - ``` - -1. Copy supplementary OpenTree-like newick files (if any) to the `OpenTree_all` directory. These are clades referenced in the OneZoom phylogeny that are missing from the OpenTree, and whose subtrees thus need to be supplied by hand. If any are required, they should be placed in the `OT_required` directory within `data/OZTreeBuild/${OZ_TREE}`. For tree building, they should be copied into the directory containing OpenTree subtrees using - - ``` - (cd data/OZTreeBuild/${OZ_TREE}/OpenTreeParts && \ - cp -n OT_required/*.nwk OpenTree_all/) - ``` - - If you do not have any supplementary `.nwk` subtrees in the `OT_required` directory, this step will output a warning, which can be ignored. - -1. (a few secs) Construct OpenTree subtrees for inclusion from the `draftversion${OT_VERSION}.tre` file. The subtrees to be extracted are specified by inclusion strings in the `.PHY` files created in step 1. The command for this is `getOpenTreesFromOneZoom.py`, and it needs to be run from within the `data/OZTreeBuild/${OZ_TREE}` directory, as follows: - - ``` - (cd data/OZTreeBuild/${OZ_TREE} && get_open_trees_from_one_zoom \ - ../../OpenTree/draftversion${OT_VERSION}.tre OpenTreeParts/OpenTree_all/ \ - BespokeTree/include_files/*.PHY) - ``` - - If you are not including any OpenTree subtrees in your final tree, you should have no `.PHY` files, and this step will output a warning, which can be ignored. - -1. (1 sec) substitute these subtrees into the main tree, and save the resulting full newick file using the `build_oz_tree` script: - - ``` - (cd data/OZTreeBuild/${OZ_TREE} && \ - build_oz_tree BespokeTree/include_files/Base.PHY OpenTreeParts/OpenTree_all/ AllLife_full_tree.phy) - ``` - - Now that we are not having to run this every sponsorship time, we should probably re-write this to actually know what tree structure looks like, maybe using Python/DendroPy (see https://github.com/jrosindell/OneZoomComplete/issues/340) and also to automatically create the list of DOIs at `${OZ_DIR}/static/FinalOutputs/refs.txt`. Note that any '@' signs in the `${OZ_TREE}_full_tree.phy` output file are indicative of OpenTree substitutions that have not been possible: it would be good to check to see if there are other sources (or old OpenTree versions) that have trees for these nodes, and place them as .phy files in `data/OZTreeBuild/${OZ_TREE}/OpenTreeParts/OT_required/`. You can check with - - ``` - grep -o '.............@' data/OZTreeBuild/${OZ_TREE}/${OZ_TREE}_full_tree.phy - ``` - - You may also want to save a zipped version of the full tree file in a place where users can download it for reference purposes, in which case you can do - - ``` - gzip < data/OZTreeBuild/${OZ_TREE}/${OZ_TREE}_full_tree.phy > ${OZ_DIR}/static/FinalOutputs/${OZ_TREE}_full_tree.phy.gz - ``` - -### Create the base tree and table data - -5. (5 to 7 hours, or a few mins if files are already filtered) This generates filtered versions of the raw input files, which then makes them faster to work with. In the DVC pipeline, this is handled by the `filter_eol`, `filter_wikidata`, `filter_sql`, and `filter_pageviews` stages, which run as separate parallel stages. Without DVC, the `generate_filtered_files` script can still be used to run them all together: - - ``` - tar -C data/OpenTree -zxvf data/OpenTree/ott${OT_TAXONOMY_VERSION}.tgz - (cd data && generate_filtered_files OZTreeBuild/AllLife/AllLife_full_tree.phy OpenTree/ott${OT_TAXONOMY_VERSION}/taxonomy.tsv EOL/provider_ids.csv.gz Wiki/wd_JSON/latest-all.json.bz2 Wiki/wp_SQL/enwiki-page.sql.gz Wiki/wp_pagecounts/pageviews*.bz2) - ``` - -1. (11 mins) On the basis of the `${OZ_TREE}_full_tree.phy` file, look for ID mappings between different datasets, calculate popularity measures via wikidata/pedia, refine the tree (remove subspecies, randomly break polytomies, remove unifurcations etc), and then create corresponding database tables together with `ordered_tree_XXXXX.nwk`, `ordered_tree_XXXXX.poly` (same file but with polytomies marked with curly braces), and `ordered_dates_XXXXX.js` files (where XXXXX is the version number, usually a timestamp). - - Additional flags can be given to override the OpenTree taxonomy in specific cases (using `--extra_source_file`), and to exclude certain taxa (e.g. dinosaurs) from the popularity calculations. - - If you do not have comprehensive tree of a clade, it probably doesn't make sense to calculate popularity measures, and you can run this script with the `-p` flag (or omit the references to the `wp_` wikipedia files). - - ``` - CSV_base_table_creator \ - data/OZTreeBuild/${OZ_TREE}/${OZ_TREE}_full_tree.phy \ - data/OpenTree/ott${OT_TAXONOMY_VERSION}/taxonomy.tsv \ - data/EOL/OneZoom_provider_ids.csv \ - data/Wiki/wd_JSON/OneZoom_latest-all.json \ - data/Wiki/wp_SQL/OneZoom_enwiki-latest-page.sql \ - data/Wiki/wp_pagecounts/OneZoom_pageviews* \ - -o data/output_files -v \ - --exclude Archosauria_ott335588 Dinosauria_ott90215 \ - --extra_source_file data/OZTreeBuild/${OZ_TREE}/BespokeTree/SupplementaryTaxonomy.tsv \ - 2> data/output_files/ordered_output.log - ``` - - Since round braces, curly braces, and commas are banned from the `simplified_ottnames` file, we can create minimal topology files by simply removing everything except these characters from the `.nwk` and `.poly` files. If the tree has been ladderised, with polytomies and unifurcations removed, the commas are also redundant, and can be removed. This is done in the next step, which saves these highly shortened strings into .js data files. - -1. (1 min) Turn the most recently saved tree files (saved in the previous step as `data/output_files/ordered_tree_XXXXXX.poly` and `ordered_dates_XXXXXX.json`) into bracketed newick JS files. In the DVC pipeline, these are output to `data/js_output/` and can be copied to the OZtree repo. Without DVC, you can write directly to the OZtree directory: - - ``` - make_js_treefiles --outdir ${OZ_DIR}/static/FinalOutputs/data - ``` - ### Upload data to the server and check it 8. If you are running the tree building scripts on a different computer to the one running the web server, you will need to push the `completetree_XXXXXX.js`, `completetree_XXXXXX.js.gz`, `cut_position_map_XXXXXX.js`, `cut_position_map_XXXXXX.js.gz`, `dates_XXXXXX.js`, `dates_XXXXXX.js.gz` files onto your server, e.g. by pushing to your local Github repo then pulling the latest github changes to the server. diff --git a/oz_tree_build/newick/newick_parser.py b/oz_tree_build/newick/newick_parser.py index 0bc7801..b9bfa85 100644 --- a/oz_tree_build/newick/newick_parser.py +++ b/oz_tree_build/newick/newick_parser.py @@ -6,7 +6,7 @@ Nodes are returned in post-order (children before parent), which is the Newick order. -For simplicity, it assumes that the tree string has no spaces. +OTTs are assumed to be in the form ott123, separated by node-name with either a space or underscore. Here is a trivial example of how to use it: @@ -28,6 +28,7 @@ __author__ = "David Ebbo" non_name_regex = re.compile(r"[,;:\(\)]") +split_ott_regex = re.compile(r"^(.*)[_ ]ott(\d+)$") def parse_tree(newick_tree): @@ -89,10 +90,10 @@ def raise_syntax_error(message): if taxon: # Check if the taxon has an ott id, and if so, parse it out - if "_ott" in taxon: - ott_index = taxon.index("_ott") - ott = taxon[ott_index + 4 :] - taxon = taxon[:ott_index] + m = split_ott_regex.match(taxon) + if m: + ott = m.group(2) + taxon = m.group(1) yield { "taxon": taxon, diff --git a/oz_tree_build/taxon_mapping_and_popularity/CSV_base_table_creator.py b/oz_tree_build/taxon_mapping_and_popularity/CSV_base_table_creator.py index 3891168..f968f7e 100755 --- a/oz_tree_build/taxon_mapping_and_popularity/CSV_base_table_creator.py +++ b/oz_tree_build/taxon_mapping_and_popularity/CSV_base_table_creator.py @@ -131,18 +131,10 @@ def get_OTT_species(taxonomy_filename): return species_list -def get_tree_and_OTT_list(tree_filename, sources): +def parse_tree(tree_filename): """ - Takes a base tree and creates objects for each node and leaf, attaching them as 'data' - dictionaries to each node in the DendroPy tree. Nodes and leaves with an OTT id also - have pointers to their data dicts stored in an OTT-keyed dict, so that mappings to other - databases (ncbi id, etc etc) can be created. - - We want to allow duplicate leaf names, so for the entire procedure we ignore the Dendropy - concept of a taxon list and simply use labels. Returns the Dendropy tree and the OTT dict. + Parses (tree_filename) and returns the DendroPy tree object """ - indexed_by_ott = {} - try: tree = Tree.get_from_path( tree_filename, @@ -150,10 +142,24 @@ def get_tree_and_OTT_list(tree_filename, sources): preserve_underscores=True, suppress_leaf_node_taxa=True, ) + return tree except Exception as e: sys.exit("Problem reading tree from " + tree_filename + ": " + str(e)) logging.info(" > read tree from " + tree_filename) + +def get_OTT_list(tree, sources): + """ + Takes a base tree and creates objects for each node and leaf, attaching them as 'data' + dictionaries to each node in the DendroPy tree. Nodes and leaves with an OTT id also + have pointers to their data dicts stored in an OTT-keyed dict, so that mappings to other + databases (ncbi id, etc etc) can be created. + + We want to allow duplicate leaf names, so for the entire procedure we ignore the Dendropy + concept of a taxon list and simply use labels. Returns the Dendropy tree and the OTT dict. + """ + indexed_by_ott = {} + ott_node = re.compile(r"(.*) ott(\d+)(@\d*)?$") # matches the OTT number mrca_ott_node = re.compile( r"(.*) (mrcaott\d+ott\d+)(@\d*)?$" @@ -161,7 +167,7 @@ def get_tree_and_OTT_list(tree_filename, sources): tot = 0 for node in tree.preorder_node_iter(): tot += 1 - node.data = {"parent": node.parent_node or None} + node.data = {} if node.label: node.label = node.label.replace("_", " ") m = ott_node.search(node.label) @@ -209,7 +215,7 @@ def get_tree_and_OTT_list(tree_filename, sources): f"✔ extracted {len(indexed_by_ott)} otts from {tot} leaves & nodes. " f"Mem usage {OTT_popularity_mapping.mem():.1f} Mb" ) - return tree, indexed_by_ott + return indexed_by_ott def add_eol_IDs_from_EOL_table_dump(source_ptrs, identifiers_filename, source_mapping): @@ -958,7 +964,8 @@ def process_all(args): # the ids for these sources may not be numbers (e.g. Silva has things like D11377/#1 logging.info("> Creating tree structure") - tree, OTT_ptrs = get_tree_and_OTT_list(args.Tree, sources) + tree = parse_tree(args.Tree) + OTT_ptrs = get_OTT_list(tree, sources) logging.info("> Adding source IDs") source_ptrs = OTT_popularity_mapping.create_from_taxonomy( diff --git a/oz_tree_build/tree_build/build_oz_tree.py b/oz_tree_build/tree_build/build_oz_tree.py index b113196..ddb41dc 100644 --- a/oz_tree_build/tree_build/build_oz_tree.py +++ b/oz_tree_build/tree_build/build_oz_tree.py @@ -12,7 +12,6 @@ from ..utilities.debug_util import parse_args_and_add_logging_switch from .oz_tokens import enumerate_one_zoom_tokens -from .token_to_oz_tree_file_mapping import token_to_file_map __author__ = "David Ebbo" @@ -44,7 +43,8 @@ def process_newick( file, node_name_in_parent=None, edge_length_in_parent=None, - mapping_entry=None, + override_edge_length=None, + override_taxon=None, expand_nodes=False, ): """ @@ -56,10 +56,7 @@ def process_newick( # If we're printing the file tree, print the current file if print_file_tree and expand_nodes: - print( - f"{' ' * depth}{node_name_in_parent}: {edge_length_in_parent} " - f"{mapping_entry['edge_length'] if mapping_entry else 0}" - ) + print(f"{' ' * depth}{node_name_in_parent}: {edge_length_in_parent} {override_edge_length or 0}") if not os.path.exists(file): logging.warning(f"Subtree file {file} does not exist") @@ -73,34 +70,25 @@ def process_newick( # We only need to look for children if it's a OneZoom file (i.e. .PHY extension) if expand_nodes: - for result in enumerate_one_zoom_tokens(tree): + for result in enumerate_one_zoom_tokens( + tree, + dict( + ot=ot_parts_folder, + oz=oz_parts_folder, + ot_required=os.path.join(os.path.dirname(os.path.dirname(ot_parts_folder)), "OT_required"), + ), + ): # Write the part of the tree before the child output_stream.write(tree[index : result["start"]]) - child_full_name = result["full_name"] - - # Check if OZ token has a base ott (e.g. 123 in foobar_ott123~456-789) - if "base_ott" in result: - # It's an extracted Open Tree file, e.g. 123.phy - sub_file = os.path.join(ot_parts_folder, f'{result["base_ott"]}.phy') - if not os.path.exists(sub_file): - # Fall back to .nwk, which happens for additional copied files - sub_file = os.path.join(ot_parts_folder, f'{result["base_ott"]}.nwk') - expand_child_nodes = False - child_mapping_entry = None - else: - # Otherwise, it's a OneZoom file, e.g. AMORPHEA@ --> Amorphea.PHY - child_mapping_entry = token_to_file_map[child_full_name] - sub_file = os.path.join(oz_parts_folder, child_mapping_entry["file"]) - expand_child_nodes = True - depth += 1 if process_newick( - sub_file, - child_full_name, - result["edge_length"], - child_mapping_entry, - expand_child_nodes, + file=result["file"], + node_name_in_parent=result["node_name_in_parent"], + edge_length_in_parent=result["edge_length_in_parent"], + override_edge_length=result["override_edge_length"], + override_taxon=result["override_taxon"], + expand_nodes=result["expand_nodes"], ): index = result["end"] else: @@ -125,12 +113,11 @@ def process_newick( # Always favor the length from our mapping, falling back to the last token in the file # Note that we never fall back to edge_length_in_parent here, following old code logic # DISCUSS: should we? - edge_length = mapping_entry["edge_length"] if mapping_entry else None - edge_length = edge_length or last_token_edge_length + edge_length = override_edge_length or last_token_edge_length - if mapping_entry: + if expand_nodes: # Three levels of fallback for .PHY files: mapping, last token, parent - node_name = mapping_entry["taxon"] or last_token_name or node_name_in_parent + node_name = override_taxon or last_token_name or node_name_in_parent else: # NB: following old code logic, the above parent vs last logic is reversed here # DISCUSS: is there a logical reason for this? diff --git a/oz_tree_build/tree_build/get_open_trees_from_one_zoom.py b/oz_tree_build/tree_build/get_open_trees_from_one_zoom.py index 9c664bb..9dc4266 100644 --- a/oz_tree_build/tree_build/get_open_trees_from_one_zoom.py +++ b/oz_tree_build/tree_build/get_open_trees_from_one_zoom.py @@ -53,7 +53,7 @@ def get_inclusions_and_exclusions_from_one_zoom_file(file, all_included_otts, al for result in enumerate_one_zoom_tokens(tree): # Check if the result has a base ott (won't have it if it's inserting another OZ file) - if "base_ott" in result: + if result.get("base_ott") is not None: all_included_otts.add(result["base_ott"]) all_excluded_otts.update(result["excluded_otts"]) @@ -72,6 +72,7 @@ def extract_trees_from_open_tree_file(open_tree_file, output_dir, all_included_o logging.info(f"Extracted {len(trees)} trees from Open Tree file") # Save each tree to a file named after the taxon + os.makedirs(output_dir, exist_ok=True) for ott, tree in trees.items(): file = os.path.join(output_dir, ott + ".phy") logging.debug(f"Writing file: {file}") diff --git a/oz_tree_build/tree_build/ott_mapping/add_ott_numbers_to_trees.py b/oz_tree_build/tree_build/ott_mapping/add_ott_numbers_to_trees.py index e49560e..cd23e25 100755 --- a/oz_tree_build/tree_build/ott_mapping/add_ott_numbers_to_trees.py +++ b/oz_tree_build/tree_build/ott_mapping/add_ott_numbers_to_trees.py @@ -54,8 +54,7 @@ def main(): ) parser.add_argument( "--output_info", - default="info_about_matches.txt", - help="file in --savein to save info about matches. If no --savein, print to stdout", + help="file to save info about matches. If not provided, print to stdout", ) parser.add_argument( "--symlink", @@ -83,8 +82,8 @@ def main(): f"Could not create dir to save files {args.savein}: " f"I/O error({e.errno}): {e.strerror}", file=sys.stderr, ) - if args.output_info and args.output_info != "": - outinfo = open(os.path.join(args.savein, args.output_info), mode="w") + if args.output_info: + outinfo = open(args.output_info, mode="w") def lookup_OTT(name_node_dict, context): """ diff --git a/oz_tree_build/tree_build/oz_tokens.py b/oz_tree_build/tree_build/oz_tokens.py index a5d8893..65242ae 100644 --- a/oz_tree_build/tree_build/oz_tokens.py +++ b/oz_tree_build/tree_build/oz_tokens.py @@ -1,18 +1,51 @@ __author__ = "David Ebbo" import logging +import os.path import re +from .token_to_oz_tree_file_mapping import token_to_file_map + __author__ = "David Ebbo" full_ott_token = re.compile(r"'?([\w\-~]+)@'?(?::([\d\.]+))?") ott_details = re.compile(r"(\w+)_ott(\d*)~?([-\d]*)$") -def enumerate_one_zoom_tokens(tree): +def parse_one_zoom_token(node_label, parts_folders=None): + """ + Parse a single OneZoom token from label name + """ + if parts_folders is None: + parts_folders = {} + + if not node_label: + return None + try: + return next(enumerate_one_zoom_tokens(node_label, parts_folders)) + except StopIteration: + return None + + +def enumerate_one_zoom_tokens(tree, parts_folders=None): """ Enumerates all the OneZoom tokens in a tree string (e.g. foobar_ott123~-789-111) + + Yields dicts with the keys: + + - start: Position in in string the match was found + - end: End of match + - node_name_in_parent: Node name from inclusion node, ignoring OZ inclusion syntax + - edge_length_in_parent: Edge length from inclusion node + - file: File path pointing to tree to substitute + - base_ott: OTT of root, if subtree is a OT tree + - excluded_otts: OTTs to exclude from subtree (as strings not ints) + - expand_nodes: Should we recurse and apply OZ inclusion rules to subtree? + - override_edge_length: Replace edge length from root node with this value + - override_taxon: Replace root node name with this value """ + if parts_folders is None: + parts_folders = {} # Skip the comment block at the start of the file start_index = tree.index("]") if "[" in tree else 0 @@ -21,24 +54,51 @@ def enumerate_one_zoom_tokens(tree): result = { "start": full_match.start(), "end": full_match.end(), - "full_name": full_match.group(1), - "edge_length": float(full_match.group(2)) if full_match.group(2) else None, + "node_name_in_parent": full_match.group(1), + "edge_length_in_parent": float(full_match.group(2)) if full_match.group(2) else None, } # Check if it matches our tilde (aka 'equal') exclusion syntax - match = ott_details.match(result["full_name"]) + match = ott_details.match(result["node_name_in_parent"]) + base_ott = None if match: # split by minus signs result["excluded_otts"] = (match.group(3) or "").split("-") # If present, the first number after '=' is the tree to extract. first_number_after_equal = result["excluded_otts"].pop(0) - result["base_ott"] = first_number_after_equal or match.group(2) + base_ott = first_number_after_equal or match.group(2) # Note that we don't append the ott in the name if it came after the '=' - result["full_name"] = match.group(1) + result["node_name_in_parent"] = match.group(1) if not first_number_after_equal: - result["full_name"] += f"_ott{result['base_ott']}" + result["node_name_in_parent"] += f"_ott{base_ott}" + + # Check if OZ token has a base ott (e.g. 123 in foobar_ott123~456-789) + if base_ott is not None: + # It's an extracted Open Tree file, e.g. 123.phy + # NB: We can't make a valid path without parts_folder["ot"], but we probably don't care in this case + result["base_ott"] = base_ott + if os.path.exists( + os.path.join(parts_folders.get("ot_required", "/unconfiguredpath/ot_requried/"), f"{base_ott}.nwk") + ): + # An ot_required orphan OT file exists, use that + result["file"] = os.path.join( + parts_folders.get("ot_required", "/unconfiguredpath/ot_requried/"), f"{base_ott}.nwk" + ) + else: + result["file"] = os.path.join(parts_folders.get("ot") or ".", f"{base_ott}.phy") + result["override_edge_length"] = None + result["override_taxon"] = None + result["expand_nodes"] = False + else: + # Otherwise, it's a OneZoom file, e.g. AMORPHEA@ --> Amorphea.PHY + child_mapping_entry = token_to_file_map[result["node_name_in_parent"]] + result["base_ott"] = None + result["file"] = os.path.join(parts_folders.get("oz") or ".", child_mapping_entry["file"]) + result["override_edge_length"] = child_mapping_entry.get("edge_length", None) + result["override_taxon"] = child_mapping_entry.get("taxon", None) + result["expand_nodes"] = True logging.debug(result) yield result diff --git a/oz_tree_build/utilities/download_opentree.py b/oz_tree_build/utilities/download_opentree.py index e1ededd..4105a90 100644 --- a/oz_tree_build/utilities/download_opentree.py +++ b/oz_tree_build/utilities/download_opentree.py @@ -21,13 +21,14 @@ import requests +OT_SSL_VERIFY = not os.environ.get("OT_SSL_VERIFY_DISABLE") SYNTHESIS_JSON_URL = ( "https://raw.githubusercontent.com/OpenTreeOfLife/opentree" "/master/webapp/static/statistics/synthesis.json" ) def fetch_synthesis_json(): - response = requests.get(SYNTHESIS_JSON_URL) + response = requests.get(SYNTHESIS_JSON_URL, verify=OT_SSL_VERIFY) response.raise_for_status() return response.json() @@ -58,7 +59,7 @@ def download_tree(version, output_dir): f"/output/labelled_supertree/labelled_supertree_simplified_ottnames.tre" ) print(f"Downloading tree from {tree_url} ...") - response = requests.get(tree_url) + response = requests.get(tree_url, verify=OT_SSL_VERIFY) response.raise_for_status() raw_path = os.path.join(output_dir, "labelled_supertree_simplified_ottnames.tre") @@ -78,7 +79,7 @@ def download_taxonomy(ott_version_raw, output_dir): ott_version = ott_version_raw.split("draft")[0] taxonomy_url = f"https://files.opentreeoflife.org/ott/{ott_version}/{ott_version}.tgz" print(f"Downloading taxonomy from {taxonomy_url} ...") - response = requests.get(taxonomy_url) + response = requests.get(taxonomy_url, verify=OT_SSL_VERIFY) response.raise_for_status() with tempfile.TemporaryDirectory() as tmpdir: diff --git a/pyproject.toml b/pyproject.toml index 2d011d2..2138c83 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,11 +31,6 @@ dev = [ "pre-commit>=4.5.1", ] -[tool.pytest.ini_options] -markers = [ - "skip_real_apis: Mark test to be skipped if real APIs are not available", -] - [project.scripts] add_ott_numbers_to_trees = "oz_tree_build.tree_build.ott_mapping.add_ott_numbers_to_trees:main" build_oz_tree = "oz_tree_build.tree_build.build_oz_tree:main" @@ -70,4 +65,7 @@ packages = ["oz_tree_build"] include-package-data = true [tool.pytest] -testpaths = ["tests"] \ No newline at end of file +testpaths = ["tests"] +markers = [ + "skip_real_apis: Mark test to be skipped if real APIs are not available", +] diff --git a/tests/test_newick_parser.py b/tests/test_newick_parser.py index c4c7286..2b20347 100644 --- a/tests/test_newick_parser.py +++ b/tests/test_newick_parser.py @@ -41,6 +41,43 @@ def test_full_parse_result(): ] +def test_ott_spaces(): + node_list = list(parse_tree("(A ott123,B ott5 om ott456:5.9)C;")) + assert node_list == [ + { + "depth": 1, + "edge_length": 0.0, + "end": 9, + "full_name_start_index": 1, + "is_leaf": True, + "ott": "123", + "start": 1, + "taxon": "A", + }, + { + "depth": 1, + "edge_length": 5.9, + "end": 30, + "full_name_start_index": 10, + "is_leaf": True, + "ott": "456", + "start": 10, + # NB: mid-string ott doesn't throw us off + "taxon": "B ott5 om", + }, + { + "depth": 0, + "edge_length": 0.0, + "end": 32, + "full_name_start_index": 31, + "is_leaf": False, + "ott": None, + "start": 0, + "taxon": "C", + }, + ] + + def test_quoted_taxa(): node_list = list(parse_tree("('Abc/def_ott123','qw e$r&ty':1.2)'C_*(ot)t789_ott987':5.5;")) assert node_list[0]["taxon"] == "Abc/def"