From 167533fe88c98ba81d5c75a3bb42afc770036230 Mon Sep 17 00:00:00 2001 From: jianshu93 Date: Mon, 1 Jan 2024 01:06:01 -0500 Subject: [PATCH 1/2] add ANI correction option by learning alignment-based ANI --- src/cgi/include/computeCoreIdentity.hpp | 6 +++++- src/map/include/map_parameters.hpp | 3 ++- src/map/include/parseCmdArgs.hpp | 3 +++ 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/cgi/include/computeCoreIdentity.hpp b/src/cgi/include/computeCoreIdentity.hpp index 07066a4..ed94832 100644 --- a/src/cgi/include/computeCoreIdentity.hpp +++ b/src/cgi/include/computeCoreIdentity.hpp @@ -289,7 +289,11 @@ namespace cgi currentResult.countSeq = std::distance(it, rangeEndIter); currentResult.totalQueryFragments = totalQueryFragments; currentResult.identity = sumIdentity/currentResult.countSeq; - + if(parameters.correct) + { + // linear correction of final identity values according to large scale orthoANI(ANI_u) computations + currentResult.identity = currentResult.identity * 41.00/36.00 - 500.00/36.00; + } CGI_ResultsVector.push_back(currentResult); //Advance the iterator it diff --git a/src/map/include/map_parameters.hpp b/src/map/include/map_parameters.hpp index 93a7c3c..8cf7720 100644 --- a/src/map/include/map_parameters.hpp +++ b/src/map/include/map_parameters.hpp @@ -34,7 +34,8 @@ namespace skch std::vector querySequences; //query sequence(s) std::string outFileName; //output file name bool reportAll; //Report all alignments if this is true - bool visualize; //Visualize the conserved regions of two genomes + bool visualize; + bool correct; //Visualize the conserved regions of two genomes bool matrixOutput; //report fastani results as lower triangular matrix float maxRatioDiff; //max Ratio for sanity check bool sanityCheck; // Sanity check for extreme Cases diff --git a/src/map/include/parseCmdArgs.hpp b/src/map/include/parseCmdArgs.hpp index b90175b..6a3b9d8 100644 --- a/src/map/include/parseCmdArgs.hpp +++ b/src/map/include/parseCmdArgs.hpp @@ -123,6 +123,7 @@ namespace skch parameters.p_value = 1e-03; parameters.percentageIdentity = 80; parameters.visualize = false; + parameters.correct = false; parameters.matrixOutput = false; parameters.referenceSize = 5000000; parameters.maxRatioDiff = 100.0; @@ -146,6 +147,7 @@ namespace skch auto minfraction_cmd = (clipp::option("--minFraction") & clipp::value("value", parameters.minFraction)) % "minimum fraction of genome that must be shared for trusting ANI. If reference and query genome size differ, smaller one among the two is considered. [default : 0.2]"; auto maxratio_cmd = (clipp::option("--maxRatioDiff") & clipp::value("value", parameters.maxRatioDiff)) % "maximum difference between (Total Ref. Length/Total Occ. Hashes) and (Total Ref. Length/Total No. Hashes). [default : 10.0]"; auto visualize_cmd = clipp::option("--visualize").set(parameters.visualize).doc("output mappings for visualization, can be enabled for single genome to single genome comparison only [disabled by default]"); + auto correct_cmd = clipp::option("--correct").set(parameters.correct).doc("correct final ANI values by learning alignment-based ANI values"); auto matrix_cmd = clipp::option("--matrix").set(parameters.matrixOutput).doc("also output ANI values as lower triangular matrix (format inspired from phylip). If enabled, you should expect an output file with .matrix extension [disabled by default]"); auto output_cmd = (clipp::option("-o", "--output") & clipp::value("value", parameters.outFileName)) % "output file name"; auto sanitycheck_cmd = clipp::option("-s", "--sanityCheck").set(parameters.sanityCheck).doc("run sanity check"); @@ -164,6 +166,7 @@ namespace skch minfraction_cmd, maxratio_cmd, visualize_cmd, + correct_cmd, matrix_cmd, output_cmd, sanitycheck_cmd, From c97fdfe37c3af05aa2bdf5e6e8910fda7524d5ca Mon Sep 17 00:00:00 2001 From: Jianshu_Zhao <38149286+jianshu93@users.noreply.github.com> Date: Tue, 2 Jan 2024 10:10:10 -0500 Subject: [PATCH 2/2] add correct ANI info to readme add correct ANI info --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index 5224ab6..096df70 100644 --- a/README.md +++ b/README.md @@ -42,6 +42,10 @@ For above use case, REFERENCE\_LIST should be a file containing directory paths $ ./fastANI --ql [QUERY_LIST] --rl [REFERENCE_LIST] -o [OUTPUT_FILE] ``` Again, QUERY\_LIST and REFERENCE\_LIST are files containing paths to genomes, one per line. +* **Corrected ANI values.** the final ANI values can be corrected by learning from alignment-based ANI (e.g., orthoANI), expecially for ANI range 70% to 85%. +```sh +$ ./fastANI --ql [QUERY_LIST] --rl [REFERENCE_LIST] -o [OUTPUT_FILE] --correct +``` **Output format.** In all above use cases, OUTPUT\_FILE will contain tab delimited row(s) with query genome, reference genome, ANI value, count of bidirectional fragment mappings, and total query fragments. Alignment fraction (wrt. the query genome) is simply the ratio of mappings and total fragments. Optionally, users can also get a second `.matrix` file with identity values arranged in a [phylip-formatted lower triangular matrix](https://www.mothur.org/wiki/Phylip-formatted_distance_matrix) by supplying `--matrix` parameter. **NOTE:** No ANI output is reported for a genome pair if ANI value is much below 80%. Such case should be computed at [amino acid level](http://enve-omics.ce.gatech.edu/aai/).