diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 8af20c95b..70966031e 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -12,8 +12,8 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - - uses: prefix-dev/setup-pixi@v0.8.1 + - uses: prefix-dev/setup-pixi@v0.9.6 with: - pixi-version: v0.37.0 + pixi-version: v0.66.0 cache: true - run: pixi run test diff --git a/CHANGELOG.md b/CHANGELOG.md index 5769cd383..2b50f8c5c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,10 @@ # Change Log All notable changes to this project will be documented in this file. +## v0.2.0 +- Update fibertools-rs version (also samtools, htslib, and bedtools) in `workflow/envs/env.yaml` +- Fix Polars issue in #52 + ## v0.1.2 diff --git a/pixi.toml b/pixi.toml index 431ab05e7..db2350554 100644 --- a/pixi.toml +++ b/pixi.toml @@ -1,41 +1,16 @@ -[project] +[workspace] authors = ["Mitchell Robert Vollger "] channels = ["conda-forge", "bioconda"] -description = "Add a short description here" +description = "A Snakemake pipeline for calling FIRE peaks using fibertools-rs." name = "FIRE" platforms = ["osx-64", "linux-64"] -version = "0.1.2" +version = "0.2.0" [tasks] fmt = "ruff format . 
&& taplo format pixi.toml && snakefmt workflow/" -test-data = { cmd = [ - "cd", - "$INIT_CWD", - "&&", - "mkdir", - "-p", - "fire-test-data", - "&&", - "aws", - "s3", - "--no-sign-request", - "sync", - "--endpoint-url", - "https://s3.kopah.orci.washington.edu", - "s3://stergachis/public/FIRE/test-data", - "fire-test-data/", -] } -test = { cmd = [ - "cd", - "$INIT_CWD/fire-test-data", - "&&", - "snakemake", - "-s", - "$PIXI_PROJECT_ROOT/workflow/Snakefile", - "--configfile", - "test.yaml", - "-k", -], depends-on = [ +test-data = { cmd = '''bash -c 'if [ -f "$INIT_CWD/fire-test-data/test.cram" ]; then echo "test data already present, skipping download"; else mkdir -p "$INIT_CWD/fire-test-data" && aws s3 --no-sign-request sync --endpoint-url https://s3.kopah.orci.washington.edu s3://stergachis/public/FIRE/test-data "$INIT_CWD/fire-test-data/"; fi' ''' } +test-clean = { cmd = '''bash -c 'cd "$INIT_CWD/fire-test-data" && rm -rf results temp .snakemake' ''' } +test = { cmd = '''bash -c 'cd "$INIT_CWD/fire-test-data" && trap "rm -rf results temp .snakemake" EXIT && snakemake -s "$PIXI_PROJECT_ROOT/workflow/Snakefile" --configfile test.yaml -k' ''', depends-on = [ "test-data", ], clean-env = true } fire = { cmd = [ diff --git a/workflow/envs/env.yaml b/workflow/envs/env.yaml index 07984df4c..27e2d5ab2 100644 --- a/workflow/envs/env.yaml +++ b/workflow/envs/env.yaml @@ -4,10 +4,10 @@ channels: - bioconda - defaults dependencies: - - samtools==1.19.1 - - htslib==1.19.1 - - bedtools==2.31 - - bioconda::fibertools-rs==0.6 + - samtools>=1.19.1 + - htslib>=1.19.1 + - bedtools>=2.31 + - bioconda::fibertools-rs==0.9 - hck>=0.9.2 - bioawk - ripgrep diff --git a/workflow/rules/coverages.smk b/workflow/rules/coverages.smk index c354c76cc..ad388b0e0 100644 --- a/workflow/rules/coverages.smk +++ b/workflow/rules/coverages.smk @@ -126,13 +126,14 @@ rule exclude_from_shuffle: conda: DEFAULT_ENV params: - exclude=EXCLUDES, + exclude=lambda wc: " ".join(EXCLUDES) if EXCLUDES else "", 
shell: """ - - ( \ - bedtools genomecov -bga -i {input.filtered} -g {input.fai} | awk '$4 == 0'; \ - less {params.exclude} \ + ( + bedtools genomecov -bga -i {input.filtered} -g {input.fai} | awk '$4 == 0' + if [ -n "{params.exclude}" ]; then + gzip -cdf {params.exclude} + fi ) \ | cut -f 1-3 \ | bedtools sort \ diff --git a/workflow/rules/fire-peaks.smk b/workflow/rules/fire-peaks.smk index f328534ab..b3a464841 100644 --- a/workflow/rules/fire-peaks.smk +++ b/workflow/rules/fire-peaks.smk @@ -34,7 +34,7 @@ rule shuffled_pileup_chromosome: DEFAULT_ENV shell: """ - {FT_EXE} pileup {input.cram} {wildcards.chrom} -t {threads} \ + {FT_EXE} pileup {input.cram} -r {wildcards.chrom} -t {threads} \ --fiber-coverage --shuffle {input.shuffled} \ --no-msp --no-nuc \ | bgzip -@ {threads} \ @@ -103,7 +103,7 @@ rule pileup_chromosome: """ {FT_EXE} pileup -t {threads} \ --haps --fiber-coverage \ - {input.bam} {wildcards.chrom} \ + {input.bam} -r {wildcards.chrom} \ | bgzip -@ {threads} \ > {output.bed} """ diff --git a/workflow/scripts/fdr-table.py b/workflow/scripts/fdr-table.py index c78d9fb53..3c185ead3 100644 --- a/workflow/scripts/fdr-table.py +++ b/workflow/scripts/fdr-table.py @@ -57,11 +57,16 @@ def read_pileup_file(infile, nrows): return None - # add scema overrides for the score columns + # add schema overrides for the score columns + # Build schema overrides keyed by positional column names (column_1, column_2, ...) + # because polars infers schema BEFORE new_columns is applied when has_header=False. + # Keying on '#chrom' / 'score' here would be silently ignored. 
schema_overrides = {} - for n in ["score", "score_H1", "score_H2", "score_shuffled"]: - if n in header: - schema_overrides[n] = float - + for col_idx, col_name in enumerate(header, start=1): + positional = f"column_{col_idx}" + if col_name in ("score", "score_H1", "score_H2", "score_shuffled"): + schema_overrides[positional] = pl.Float64 + elif col_name == "#chrom": + schema_overrides[positional] = pl.Utf8 logging.info(f"Header of the pileup file:\n{header}") logging.info(f"Schema overrides for the pileup file:\n{schema_overrides}") diff --git a/workflow/scripts/merge_fire_peaks.py b/workflow/scripts/merge_fire_peaks.py index eaf0c33f5..8867cec37 100755 --- a/workflow/scripts/merge_fire_peaks.py +++ b/workflow/scripts/merge_fire_peaks.py @@ -124,7 +124,7 @@ def main( logger.setLevel(log_level) inf = io.StringIO(sys.stdin.read()) - df = pl.read_csv(inf, separator="\t", null_values=".") + df = pl.read_csv(inf, separator="\t", null_values=".", schema_overrides={"#chrom": pl.Utf8}) if df.shape[0] == 0: logging.info("No peaks to merge") return 0