From f2aa051b0558da2a0e0569cbfba25918adb9a7c6 Mon Sep 17 00:00:00 2001 From: Jonas Kruckenberg <118265418+CrabNejonas@users.noreply.github.com> Date: Wed, 1 Nov 2023 15:29:43 +0100 Subject: [PATCH 01/28] feat: no-std --- .idea/fst.iml | 14 ++++ .idea/modules.xml | 8 +++ .idea/vcs.xml | 6 ++ .idea/workspace.xml | 126 +++++++++++++++++++++++++++++++++++ Cargo.toml | 4 +- src/automaton/levenshtein.rs | 23 +++++-- src/bytes.rs | 7 +- src/error.rs | 24 ++++++- src/lib.rs | 13 +++- src/map.rs | 57 +++++++++++++++- src/raw/build.rs | 31 +++++++-- src/raw/counting_writer.rs | 3 + src/raw/error.rs | 30 ++++++++- src/raw/mod.rs | 52 +++++++++++++-- src/raw/node.rs | 16 ++++- src/raw/ops.rs | 28 +++++++- src/raw/registry.rs | 11 ++- src/raw/registry_minimal.rs | 6 +- src/set.rs | 47 ++++++++++++- 19 files changed, 468 insertions(+), 38 deletions(-) create mode 100644 .idea/fst.iml create mode 100644 .idea/modules.xml create mode 100644 .idea/vcs.xml create mode 100644 .idea/workspace.xml diff --git a/.idea/fst.iml b/.idea/fst.iml new file mode 100644 index 00000000..e4ade7cf --- /dev/null +++ b/.idea/fst.iml @@ -0,0 +1,14 @@ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 00000000..46ed0723 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 00000000..35eb1ddf --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/.idea/workspace.xml b/.idea/workspace.xml new file mode 100644 index 00000000..613a07ed --- /dev/null +++ b/.idea/workspace.xml @@ -0,0 +1,126 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + { + "associatedIndex": 6 +} + + + + + + + + + + + + + + + + + 1698689393828 + + + + + + \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml index ca497485..02d51418 100644 --- 
a/Cargo.toml +++ b/Cargo.toml @@ -19,8 +19,10 @@ members = ["bench", "fst-bin"] exclude = ["fst-levenshtein", "fst-regex"] [features] -default = [] +default = ["std"] levenshtein = ["utf8-ranges"] +std = ["alloc"] +alloc = [] [patch.crates-io] fst = { path = "." } diff --git a/src/automaton/levenshtein.rs b/src/automaton/levenshtein.rs index 4e2c2390..63b0dbf0 100644 --- a/src/automaton/levenshtein.rs +++ b/src/automaton/levenshtein.rs @@ -1,7 +1,9 @@ -use std::cmp; -use std::collections::hash_map::Entry; -use std::collections::{HashMap, HashSet}; -use std::fmt; +use core::cmp; +use core::fmt; +#[cfg(feature = "alloc")] +use alloc::collections::hash_map::Entry; +#[cfg(feature = "alloc")] +use alloc::collections::{HashMap, HashSet}; use utf8_ranges::{Utf8Range, Utf8Sequences}; @@ -34,7 +36,7 @@ impl fmt::Display for LevenshteinError { } } -impl std::error::Error for LevenshteinError {} +impl core::error::Error for LevenshteinError {} /// A Unicode aware Levenshtein automaton for running efficient fuzzy queries. /// @@ -92,11 +94,13 @@ impl std::error::Error for LevenshteinError {} /// /// This is important functionality, so one should count on this implementation /// being vastly improved in the future. +#[cfg(feature = "alloc")] pub struct Levenshtein { prog: DynamicLevenshtein, dfa: Dfa, } +#[cfg(feature = "alloc")] impl Levenshtein { /// Create a new Levenshtein query. /// @@ -109,6 +113,7 @@ impl Levenshtein { /// /// A `Levenshtein` value satisfies the `Automaton` trait, which means it /// can be used with the `search` method of any finite state transducer. + #[cfg(feature = "alloc")] pub fn new( query: &str, distance: u32, @@ -132,6 +137,7 @@ impl Levenshtein { /// /// A `Levenshtein` value satisfies the `Automaton` trait, which means it /// can be used with the `search` method of any finite state transducer. 
+ #[cfg(feature = "alloc")] pub fn new_with_limit( query: &str, distance: u32, @@ -147,6 +153,7 @@ impl Levenshtein { } } +#[cfg(feature = "alloc")] impl fmt::Debug for Levenshtein { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!( @@ -158,11 +165,13 @@ impl fmt::Debug for Levenshtein { } #[derive(Clone)] +#[cfg(feature = "alloc")] struct DynamicLevenshtein { query: String, dist: usize, } +#[cfg(feature = "alloc")] impl DynamicLevenshtein { fn start(&self) -> Vec { (0..self.query.chars().count() + 1).collect() @@ -190,6 +199,7 @@ impl DynamicLevenshtein { } } +#[cfg(feature = "alloc")] impl Automaton for Levenshtein { type State = Option; @@ -215,6 +225,7 @@ impl Automaton for Levenshtein { } #[derive(Debug)] +#[cfg(feature = "alloc")] struct Dfa { states: Vec, } @@ -237,12 +248,14 @@ impl fmt::Debug for State { } } +#[cfg(feature = "alloc")] struct DfaBuilder { dfa: Dfa, lev: DynamicLevenshtein, cache: HashMap, usize>, } +#[cfg(feature = "alloc")] impl DfaBuilder { fn new(lev: DynamicLevenshtein) -> DfaBuilder { DfaBuilder { diff --git a/src/bytes.rs b/src/bytes.rs index 35da4112..943feaa8 100644 --- a/src/bytes.rs +++ b/src/bytes.rs @@ -1,4 +1,5 @@ -use std::convert::TryInto; +use core::convert::TryInto; +#[cfg(feature = "std")] use std::io; /// Read a u32 in little endian format from the beginning of the given slice. @@ -30,6 +31,7 @@ pub fn write_u32_le(n: u32, slice: &mut [u8]) { /// Like write_u32_le, but to an io::Write implementation. If every byte could /// not be writen, then this returns an error. #[inline] +#[cfg(feature = "std")] pub fn io_write_u32_le(n: u32, mut wtr: W) -> io::Result<()> { let mut buf = [0; 4]; write_u32_le(n, &mut buf); @@ -55,6 +57,7 @@ pub fn write_u64_le(n: u64, slice: &mut [u8]) { /// Like write_u64_le, but to an io::Write implementation. If every byte could /// not be writen, then this returns an error. 
#[inline] +#[cfg(feature = "std")] pub fn io_write_u64_le(n: u64, mut wtr: W) -> io::Result<()> { let mut buf = [0; 8]; write_u64_le(n, &mut buf); @@ -65,6 +68,7 @@ pub fn io_write_u64_le(n: u64, mut wtr: W) -> io::Result<()> { /// and writes it to the given writer. The number of bytes written is returned /// on success. #[inline] +#[cfg(feature = "std")] pub fn pack_uint(wtr: W, n: u64) -> io::Result { let nbytes = pack_size(n); pack_uint_in(wtr, n, nbytes).map(|_| nbytes) @@ -76,6 +80,7 @@ pub fn pack_uint(wtr: W, n: u64) -> io::Result { /// `nbytes` must be >= pack_size(n) and <= 8, where `pack_size(n)` is the /// smallest number of bytes that can store the integer given. #[inline] +#[cfg(feature = "std")] pub fn pack_uint_in( mut wtr: W, mut n: u64, diff --git a/src/error.rs b/src/error.rs index 7f2bd347..603d331e 100644 --- a/src/error.rs +++ b/src/error.rs @@ -1,11 +1,17 @@ -use std::fmt; +use core::fmt; +#[cfg(feature = "std")] use std::io; use crate::raw; /// A `Result` type alias for this crate's `Error` type. +#[cfg(feature = "std")] pub type Result = std::result::Result; +/// A `Result` type alias for this crate's `Error` type. +#[cfg(not(feature = "std"))] +pub type Result = core::result::Result; + /// An error that encapsulates all possible errors in this crate. #[derive(Debug)] pub enum Error { @@ -13,9 +19,11 @@ pub enum Error { /// transducer. Fst(raw::Error), /// An IO error that occurred while writing a finite state transducer. 
+ #[cfg(feature = "std")] Io(io::Error), } +#[cfg(feature = "std")] impl From for Error { #[inline] fn from(err: io::Error) -> Error { @@ -34,15 +42,29 @@ impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match *self { Error::Fst(_) => write!(f, "FST error"), + #[cfg(feature = "std")] Error::Io(_) => write!(f, "I/O error"), } } } +#[cfg(feature = "std")] impl std::error::Error for Error { fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { match *self { Error::Fst(ref err) => Some(err), + #[cfg(feature = "std")] + Error::Io(ref err) => Some(err), + } + } +} + +#[cfg(not(feature = "std"))] +impl core::error::Error for Error { + fn source(&self) -> Option<&(dyn core::error::Error + 'static)> { + match *self { + Error::Fst(ref err) => Some(err), + #[cfg(feature = "std")] Error::Io(ref err) => Some(err), } } diff --git a/src/lib.rs b/src/lib.rs index 6bf974e4..684f4670 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -299,15 +299,24 @@ data structures found in the standard library, such as `BTreeSet` and `fst-bin/src/merge.rs` from the root of this crate's repository. 
*/ +#![cfg_attr(not(feature = "std"), no_std)] #![deny(missing_docs)] +#![cfg_attr(not(feature = "std"), feature(error_in_core))] + +#[cfg(feature = "alloc")] +extern crate alloc; #[cfg(all(feature = "levenshtein", doctest))] doc_comment::doctest!("../README.md"); pub use crate::automaton::Automaton; pub use crate::error::{Error, Result}; -pub use crate::map::{Map, MapBuilder}; -pub use crate::set::{Set, SetBuilder}; +pub use crate::map::Map; +#[cfg(feature = "alloc")] +pub use crate::map::MapBuilder; +pub use crate::set::Set; +#[cfg(feature = "alloc")] +pub use crate::set::SetBuilder; pub use crate::stream::{IntoStreamer, Streamer}; mod bytes; diff --git a/src/map.rs b/src/map.rs index d2121e0e..be735ba8 100644 --- a/src/map.rs +++ b/src/map.rs @@ -1,12 +1,20 @@ -use std::fmt; +#[cfg(feature = "alloc")] +use core::fmt; +#[cfg(feature = "std")] use std::io; -use std::iter::{self, FromIterator}; +#[cfg(feature = "alloc")] +use core::iter::{self, FromIterator}; +#[cfg(feature = "alloc")] use crate::automaton::{AlwaysMatch, Automaton}; use crate::raw; pub use crate::raw::IndexedValue; -use crate::stream::{IntoStreamer, Streamer}; +#[cfg(feature = "alloc")] +use crate::stream::IntoStreamer; +use crate::stream::Streamer; use crate::Result; +#[cfg(feature = "alloc")] +use alloc::{vec::Vec, string::String}; /// Map is a lexicographically ordered map from byte strings to integers. /// @@ -54,6 +62,7 @@ use crate::Result; #[derive(Clone)] pub struct Map(raw::Fst); +#[cfg(feature = "alloc")] impl Map> { /// Create a `Map` from an iterator of lexicographically ordered byte /// strings and associated values. @@ -64,6 +73,7 @@ impl Map> { /// Note that this is a convenience function to build a map in memory. /// To build a map that streams to an arbitrary `io::Write`, use /// `MapBuilder`. 
+ #[cfg(feature = "std")] pub fn from_iter(iter: I) -> Result>> where K: AsRef<[u8]>, @@ -167,6 +177,7 @@ impl> Map { /// ]); /// ``` #[inline] + #[cfg(feature = "alloc")] pub fn stream(&self) -> Stream<'_> { Stream(self.0.stream()) } @@ -190,6 +201,7 @@ impl> Map { /// assert_eq!(keys, vec![b"a", b"b", b"c"]); /// ``` #[inline] + #[cfg(feature = "alloc")] pub fn keys(&self) -> Keys<'_> { Keys(self.0.stream()) } @@ -214,6 +226,7 @@ impl> Map { /// assert_eq!(values, vec![1, 2, 3]); /// ``` #[inline] + #[cfg(feature = "alloc")] pub fn values(&self) -> Values<'_> { Values(self.0.stream()) } @@ -250,6 +263,7 @@ impl> Map { /// ]); /// ``` #[inline] + #[cfg(feature = "alloc")] pub fn range(&self) -> StreamBuilder<'_> { StreamBuilder(self.0.range()) } @@ -301,6 +315,7 @@ impl> Map { /// Ok(()) /// } /// ``` + #[cfg(feature = "alloc")] pub fn search(&self, aut: A) -> StreamBuilder<'_, A> { StreamBuilder(self.0.search(aut)) } @@ -354,6 +369,7 @@ fn example() -> Result<(), Box> { ``` "## )] + #[cfg(feature = "alloc")] pub fn search_with_state( &self, aut: A, @@ -417,6 +433,7 @@ fn example() -> Result<(), Box> { /// ]); /// ``` #[inline] + #[cfg(feature = "alloc")] pub fn op(&self) -> OpBuilder<'_> { OpBuilder::new().add(self) } @@ -462,6 +479,7 @@ fn example() -> Result<(), Box> { } } +#[cfg(feature = "std")] impl Default for Map> { #[inline] fn default() -> Map> { @@ -469,6 +487,7 @@ impl Default for Map> { } } +#[cfg(feature = "alloc")] impl> fmt::Debug for Map { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "Map([")?; @@ -501,6 +520,7 @@ impl> AsRef> for Map { } } +#[cfg(feature = "alloc")] impl<'m, 'a, D: AsRef<[u8]>> IntoStreamer<'a> for &'m Map { type Item = (&'a [u8], u64); type Into = Stream<'m>; @@ -606,8 +626,10 @@ impl<'m, 'a, D: AsRef<[u8]>> IntoStreamer<'a> for &'m Map { /// (b"stevie".to_vec(), 3), /// ]); /// ``` +#[cfg(feature = "alloc")] pub struct MapBuilder(raw::Builder); +#[cfg(feature = "std")] impl MapBuilder> { /// Create a 
builder that builds a map in memory. #[inline] @@ -622,6 +644,7 @@ impl MapBuilder> { } } +#[cfg(feature = "std")] impl MapBuilder { /// Create a builder that builds a map by writing it to `wtr` in a /// streaming fashion. @@ -706,10 +729,12 @@ impl MapBuilder { /// the stream. By default, no filtering is done. /// /// The `'m` lifetime parameter refers to the lifetime of the underlying map. +#[cfg(feature = "alloc")] pub struct Stream<'m, A = AlwaysMatch>(raw::Stream<'m, A>) where A: Automaton; +#[cfg(feature = "alloc")] impl<'a, 'm, A: Automaton> Streamer<'a> for Stream<'m, A> { type Item = (&'a [u8], u64); @@ -718,6 +743,7 @@ impl<'a, 'm, A: Automaton> Streamer<'a> for Stream<'m, A> { } } +#[cfg(feature = "alloc")] impl<'m, A: Automaton> Stream<'m, A> { /// Convert this stream into a vector of byte strings and outputs. /// @@ -768,10 +794,12 @@ impl<'m, A: Automaton> Stream<'m, A> { /// the stream. By default, no filtering is done. /// /// The `'m` lifetime parameter refers to the lifetime of the underlying map. +#[cfg(feature = "alloc")] pub struct StreamWithState<'m, A = AlwaysMatch>(raw::StreamWithState<'m, A>) where A: Automaton; +#[cfg(feature = "alloc")] impl<'a, 'm, A: 'a + Automaton> Streamer<'a> for StreamWithState<'m, A> where A::State: Clone, @@ -786,8 +814,10 @@ where /// A lexicographically ordered stream of keys from a map. /// /// The `'m` lifetime parameter refers to the lifetime of the underlying map. +#[cfg(feature = "alloc")] pub struct Keys<'m>(raw::Stream<'m>); +#[cfg(feature = "alloc")] impl<'a, 'm> Streamer<'a> for Keys<'m> { type Item = &'a [u8]; @@ -801,8 +831,10 @@ impl<'a, 'm> Streamer<'a> for Keys<'m> { /// corresponding key. /// /// The `'m` lifetime parameter refers to the lifetime of the underlying map. 
+#[cfg(feature = "alloc")] pub struct Values<'m>(raw::Stream<'m>); +#[cfg(feature = "alloc")] impl<'a, 'm> Streamer<'a> for Values<'m> { type Item = u64; @@ -824,8 +856,10 @@ impl<'a, 'm> Streamer<'a> for Values<'m> { /// the stream. By default, no filtering is done. /// /// The `'m` lifetime parameter refers to the lifetime of the underlying map. +#[cfg(feature = "alloc")] pub struct StreamBuilder<'m, A = AlwaysMatch>(raw::StreamBuilder<'m, A>); +#[cfg(feature = "alloc")] impl<'m, A: Automaton> StreamBuilder<'m, A> { /// Specify a greater-than-or-equal-to bound. pub fn ge>(self, bound: T) -> StreamBuilder<'m, A> { @@ -848,6 +882,8 @@ impl<'m, A: Automaton> StreamBuilder<'m, A> { } } +#[cfg(feature = "alloc")] + impl<'m, 'a, A: Automaton> IntoStreamer<'a> for StreamBuilder<'m, A> { type Item = (&'a [u8], u64); type Into = Stream<'m, A>; @@ -874,10 +910,12 @@ impl<'m, 'a, A: Automaton> IntoStreamer<'a> for StreamBuilder<'m, A> { /// the stream. By default, no filtering is done. /// /// The `'m` lifetime parameter refers to the lifetime of the underlying map. +#[cfg(feature = "alloc")] pub struct StreamWithStateBuilder<'m, A = AlwaysMatch>( raw::StreamWithStateBuilder<'m, A>, ); +#[cfg(feature = "alloc")] impl<'m, A: Automaton> StreamWithStateBuilder<'m, A> { /// Specify a greater-than-or-equal-to bound. pub fn ge>( @@ -912,6 +950,7 @@ impl<'m, A: Automaton> StreamWithStateBuilder<'m, A> { } } +#[cfg(feature = "alloc")] impl<'m, 'a, A: 'a + Automaton> IntoStreamer<'a> for StreamWithStateBuilder<'m, A> where @@ -942,8 +981,10 @@ where /// stream. /// /// The `'m` lifetime parameter refers to the lifetime of the underlying set. +#[cfg(feature = "alloc")] pub struct OpBuilder<'m>(raw::OpBuilder<'m>); +#[cfg(feature = "alloc")] impl<'m> OpBuilder<'m> { /// Create a new set operation builder. 
#[inline] @@ -1159,6 +1200,7 @@ impl<'m> OpBuilder<'m> { } } +#[cfg(feature = "alloc")] impl<'f, I, S> Extend for OpBuilder<'f> where I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], u64)>, @@ -1174,6 +1216,7 @@ where } } +#[cfg(feature = "alloc")] impl<'f, I, S> FromIterator for OpBuilder<'f> where I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], u64)>, @@ -1192,8 +1235,10 @@ where /// A stream of set union over multiple map streams in lexicographic order. /// /// The `'m` lifetime parameter refers to the lifetime of the underlying map. +#[cfg(feature = "alloc")] pub struct Union<'m>(raw::Union<'m>); +#[cfg(feature = "alloc")] impl<'a, 'm> Streamer<'a> for Union<'m> { type Item = (&'a [u8], &'a [IndexedValue]); @@ -1207,8 +1252,10 @@ impl<'a, 'm> Streamer<'a> for Union<'m> { /// order. /// /// The `'m` lifetime parameter refers to the lifetime of the underlying map. +#[cfg(feature = "alloc")] pub struct Intersection<'m>(raw::Intersection<'m>); +#[cfg(feature = "alloc")] impl<'a, 'm> Streamer<'a> for Intersection<'m> { type Item = (&'a [u8], &'a [IndexedValue]); @@ -1226,8 +1273,10 @@ impl<'a, 'm> Streamer<'a> for Intersection<'m> { /// appear in any other streams. /// /// The `'m` lifetime parameter refers to the lifetime of the underlying map. +#[cfg(feature = "alloc")] pub struct Difference<'m>(raw::Difference<'m>); +#[cfg(feature = "alloc")] impl<'a, 'm> Streamer<'a> for Difference<'m> { type Item = (&'a [u8], &'a [IndexedValue]); @@ -1241,8 +1290,10 @@ impl<'a, 'm> Streamer<'a> for Difference<'m> { /// lexicographic order. /// /// The `'m` lifetime parameter refers to the lifetime of the underlying map. 
+#[cfg(feature = "alloc")] pub struct SymmetricDifference<'m>(raw::SymmetricDifference<'m>); +#[cfg(feature = "alloc")] impl<'a, 'm> Streamer<'a> for SymmetricDifference<'m> { type Item = (&'a [u8], &'a [IndexedValue]); diff --git a/src/raw/build.rs b/src/raw/build.rs index e93626b2..b74448ca 100644 --- a/src/raw/build.rs +++ b/src/raw/build.rs @@ -1,16 +1,29 @@ +#[cfg(feature = "std")] use std::io; - +#[cfg(feature = "std")] use crate::bytes; +#[cfg(feature = "std")] use crate::error::Result; +#[cfg(feature = "alloc")] use crate::raw::counting_writer::CountingWriter; +#[cfg(feature = "std")] use crate::raw::error::Error; -use crate::raw::registry::{Registry, RegistryEntry}; +#[cfg(feature = "alloc")] +use crate::raw::registry::Registry; +#[cfg(feature = "std")] +use crate::raw::registry::RegistryEntry; +use crate::raw::Output; +#[cfg(feature = "alloc")] +use crate::raw::{CompiledAddr, Fst, Transition}; +#[cfg(feature = "std")] use crate::raw::{ - CompiledAddr, Fst, FstType, Output, Transition, EMPTY_ADDRESS, + FstType, EMPTY_ADDRESS, NONE_ADDRESS, VERSION, }; -// use raw::registry_minimal::{Registry, RegistryEntry}; +#[cfg(feature = "std")] use crate::stream::{IntoStreamer, Streamer}; +#[cfg(feature = "alloc")] +use alloc::{vec::Vec, vec}; /// A builder for creating a finite state transducer. /// @@ -40,6 +53,7 @@ use crate::stream::{IntoStreamer, Streamer}; /// /// The algorithmic complexity of fst construction is `O(n)` where `n` is the /// number of elements added to the fst. +#[cfg(feature = "alloc")] pub struct Builder { /// The FST raw data is written directly to `wtr`. 
/// @@ -73,17 +87,20 @@ pub struct Builder { } #[derive(Debug)] +#[cfg(feature = "alloc")] struct UnfinishedNodes { stack: Vec, } #[derive(Debug)] +#[cfg(feature = "alloc")] struct BuilderNodeUnfinished { node: BuilderNode, last: Option, } #[derive(Debug, Hash, Eq, PartialEq)] +#[cfg(feature = "alloc")] pub struct BuilderNode { pub is_final: bool, pub final_output: Output, @@ -96,6 +113,7 @@ struct LastTransition { out: Output, } +#[cfg(feature = "std")] impl Builder> { /// Create a builder that builds an fst in memory. #[inline] @@ -110,6 +128,7 @@ impl Builder> { } } +#[cfg(feature = "std")] impl Builder { /// Create a builder that builds an fst by writing it to `wtr` in a /// streaming fashion. @@ -325,6 +344,7 @@ impl Builder { } } +#[cfg(feature = "alloc")] impl UnfinishedNodes { fn new() -> UnfinishedNodes { let mut unfinished = UnfinishedNodes { stack: Vec::with_capacity(64) }; @@ -422,6 +442,7 @@ impl UnfinishedNodes { } } +#[cfg(feature = "alloc")] impl BuilderNodeUnfinished { fn last_compiled(&mut self, addr: CompiledAddr) { if let Some(trans) = self.last.take() { @@ -446,6 +467,7 @@ impl BuilderNodeUnfinished { } } +#[cfg(feature = "alloc")] impl Clone for BuilderNode { fn clone(&self) -> BuilderNode { BuilderNode { @@ -463,6 +485,7 @@ impl Clone for BuilderNode { } } +#[cfg(feature = "alloc")] impl Default for BuilderNode { fn default() -> BuilderNode { BuilderNode { diff --git a/src/raw/counting_writer.rs b/src/raw/counting_writer.rs index 70d9315f..1fce9bbd 100644 --- a/src/raw/counting_writer.rs +++ b/src/raw/counting_writer.rs @@ -1,3 +1,4 @@ +#[cfg(feature = "std")] use std::io; use crate::raw::crc32::CheckSummer; @@ -9,6 +10,7 @@ pub struct CountingWriter { summer: CheckSummer, } +#[cfg(feature = "std")] impl CountingWriter { /// Wrap the given writer with a counter. 
pub fn new(wtr: W) -> CountingWriter { @@ -43,6 +45,7 @@ impl CountingWriter { } } +#[cfg(feature = "std")] impl io::Write for CountingWriter { fn write(&mut self, buf: &[u8]) -> io::Result { self.summer.update(buf); diff --git a/src/raw/error.rs b/src/raw/error.rs index b2837717..35c0effa 100644 --- a/src/raw/error.rs +++ b/src/raw/error.rs @@ -1,6 +1,9 @@ -use std::fmt; -use std::str; -use std::string::FromUtf8Error; +use core::fmt; +use core::str; +#[cfg(feature = "alloc")] +use alloc::string::FromUtf8Error; +#[cfg(feature = "alloc")] +use alloc::{vec::Vec, string::String, borrow::ToOwned, format}; use crate::raw::FstType; @@ -47,6 +50,7 @@ pub enum Error { ChecksumMissing, /// A duplicate key was inserted into a finite state transducer, which is /// not allowed. + #[cfg(feature = "alloc")] DuplicateKey { /// The duplicate key. got: Vec, @@ -54,6 +58,7 @@ pub enum Error { /// A key was inserted out of order into a finite state transducer. /// /// Keys must always be inserted in lexicographic order. + #[cfg(feature = "alloc")] OutOfOrder { /// The last key successfully inserted. previous: Vec, @@ -72,6 +77,7 @@ pub enum Error { got: FstType, }, /// An error that occurred when trying to decode a UTF-8 byte key. + #[cfg(feature = "alloc")] FromUtf8(FromUtf8Error), /// Hints that destructuring should not be exhaustive. 
/// @@ -85,6 +91,7 @@ pub enum Error { impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match *self { + #[cfg(feature = "alloc")] Error::FromUtf8(ref err) => err.fmt(f), Error::Version { expected, got } => write!( f, @@ -112,11 +119,13 @@ usually means you're trying to read data that isn't actually an encoded FST.", f, "FST verification failed: FST does not contain a checksum", ), + #[cfg(feature = "alloc")] Error::DuplicateKey { ref got } => write!( f, "Error inserting duplicate key: '{}'.", format_bytes(&*got) ), + #[cfg(feature = "alloc")] Error::OutOfOrder { ref previous, ref got } => write!( f, "\ @@ -142,15 +151,29 @@ impl fmt::Debug for Error { } } +#[cfg(feature = "std")] impl std::error::Error for Error { fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { match *self { + #[cfg(feature = "alloc")] Error::FromUtf8(ref err) => Some(err), _ => None, } } } +#[cfg(not(feature = "std"))] +impl core::error::Error for Error { + fn source(&self) -> Option<&(dyn core::error::Error + 'static)> { + match *self { + #[cfg(feature = "alloc")] + Error::FromUtf8(ref err) => Some(err), + _ => None, + } + } +} + +#[cfg(feature = "alloc")] impl From for Error { #[inline] fn from(err: FromUtf8Error) -> Error { @@ -163,6 +186,7 @@ impl From for Error { /// /// Essentially, try to decode the bytes as UTF-8 and show that. Failing that, /// just show the sequence of bytes. +#[cfg(feature = "alloc")] fn format_bytes(bytes: &[u8]) -> String { match str::from_utf8(bytes) { Ok(s) => s.to_owned(), diff --git a/src/raw/mod.rs b/src/raw/mod.rs index 6cc0eeb2..918f0f31 100644 --- a/src/raw/mod.rs +++ b/src/raw/mod.rs @@ -18,21 +18,23 @@ option of specifying a merge strategy for output values. Most of the rest of the types are streams from set operations. 
*/ -use std::cmp; -use std::fmt; +use core::cmp; +use core::fmt; use crate::automaton::{AlwaysMatch, Automaton}; use crate::bytes; use crate::error::Result; use crate::stream::{IntoStreamer, Streamer}; +#[cfg(feature = "alloc")] pub use crate::raw::build::Builder; pub use crate::raw::error::Error; pub use crate::raw::node::{Node, Transitions}; -pub use crate::raw::ops::{ - Difference, IndexedValue, Intersection, OpBuilder, SymmetricDifference, - Union, -}; +pub use crate::raw::ops::IndexedValue; +#[cfg(feature = "alloc")] +pub use crate::raw::ops::{Difference, Intersection, OpBuilder, SymmetricDifference, Union}; +#[cfg(feature = "alloc")] +use alloc::{vec::Vec, vec, borrow::ToOwned, string::String}; mod build; mod common_inputs; @@ -282,6 +284,7 @@ struct Meta { checksum: Option, } +#[cfg(feature = "alloc")] impl Fst> { /// Create a new FST from an iterator of lexicographically ordered byte /// strings. Every key's value is set to `0`. @@ -292,6 +295,7 @@ impl Fst> { /// Note that this is a convenience function to build an FST in memory. /// To build an FST that streams to an arbitrary `io::Write`, use /// `raw::Builder`. + #[cfg(feature = "std")] pub fn from_iter_set(iter: I) -> Result>> where K: AsRef<[u8]>, @@ -314,6 +318,7 @@ impl Fst> { /// Note that this is a convenience function to build an FST in memory. /// To build an FST that streams to an arbitrary `io::Write`, use /// `raw::Builder`. + #[cfg(feature = "std")] pub fn from_iter_map(iter: I) -> Result>> where K: AsRef<[u8]>, @@ -429,6 +434,7 @@ impl> Fst { /// The values in this FST are not monotonically increasing when sorted /// lexicographically by key, then this routine has unspecified behavior. 
#[inline] + #[cfg(feature = "alloc")] pub fn get_key(&self, value: u64) -> Option> { let mut key = vec![]; if self.get_key_into(value, &mut key) { @@ -448,6 +454,7 @@ impl> Fst { /// The values in this FST are not monotonically increasing when sorted /// lexicographically by key, then this routine has unspecified behavior. #[inline] + #[cfg(feature = "alloc")] pub fn get_key_into(&self, value: u64, key: &mut Vec) -> bool { self.as_ref().get_key_into(value, key) } @@ -455,6 +462,7 @@ impl> Fst { /// Return a lexicographically ordered stream of all key-value pairs in /// this fst. #[inline] + #[cfg(feature = "alloc")] pub fn stream(&self) -> Stream<'_> { StreamBuilder::new(self.as_ref(), AlwaysMatch).into_stream() } @@ -464,12 +472,14 @@ impl> Fst { /// A range query returns a subset of key-value pairs in this fst in a /// range given in lexicographic order. #[inline] + #[cfg(feature = "alloc")] pub fn range(&self) -> StreamBuilder<'_> { StreamBuilder::new(self.as_ref(), AlwaysMatch) } /// Executes an automaton on the keys of this FST. #[inline] + #[cfg(feature = "alloc")] pub fn search(&self, aut: A) -> StreamBuilder<'_, A> { StreamBuilder::new(self.as_ref(), aut) } @@ -478,6 +488,7 @@ impl> Fst { /// keys along with the corresponding matching states in the given /// automaton. #[inline] + #[cfg(feature = "alloc")] pub fn search_with_state( &self, aut: A, @@ -538,6 +549,7 @@ impl> Fst { /// symmetric difference on the keys of the fst. These set operations also /// allow one to specify how conflicting values are merged in the stream. #[inline] + #[cfg(feature = "alloc")] pub fn op(&self) -> OpBuilder<'_> { OpBuilder::new().add(self) } @@ -548,6 +560,7 @@ impl> Fst { /// `stream` must be a lexicographically ordered sequence of byte strings /// with associated values. 
#[inline] + #[cfg(feature = "alloc")] pub fn is_disjoint<'f, I, S>(&self, stream: I) -> bool where I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>, @@ -562,6 +575,7 @@ impl> Fst { /// `stream` must be a lexicographically ordered sequence of byte strings /// with associated values. #[inline] + #[cfg(feature = "alloc")] pub fn is_subset<'f, I, S>(&self, stream: I) -> bool where I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>, @@ -581,6 +595,7 @@ impl> Fst { /// `stream` must be a lexicographically ordered sequence of byte strings /// with associated values. #[inline] + #[cfg(feature = "alloc")] pub fn is_superset<'f, I, S>(&self, stream: I) -> bool where I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>, @@ -622,6 +637,7 @@ impl> Fst { /// Returns a copy of the binary contents of this FST. #[inline] + #[cfg(feature = "alloc")] pub fn to_vec(&self) -> Vec { self.as_ref().to_vec() } @@ -662,11 +678,13 @@ impl Fst { } } +#[cfg(feature = "alloc")] impl<'a, 'f, D: AsRef<[u8]>> IntoStreamer<'a> for &'f Fst { type Item = (&'a [u8], Output); type Into = Stream<'f>; #[inline] + #[cfg(feature = "alloc")] fn into_stream(self) -> Stream<'f> { StreamBuilder::new(self.as_ref(), AlwaysMatch).into_stream() } @@ -712,6 +730,7 @@ impl<'f> FstRef<'f> { } #[inline] + #[cfg(feature = "alloc")] fn get_key_into(&self, mut value: u64, key: &mut Vec) -> bool { let mut node = self.root(); while value != 0 || !node.is_final() { @@ -767,6 +786,7 @@ impl<'f> FstRef<'f> { } #[inline] + #[cfg(feature = "alloc")] fn to_vec(&self) -> Vec { self.as_bytes().to_vec() } @@ -799,6 +819,7 @@ impl<'f> FstRef<'f> { /// the stream. By default, no filtering is done. /// /// The `'f` lifetime parameter refers to the lifetime of the underlying fst. 
+#[cfg(feature = "alloc")] pub struct StreamBuilder<'f, A = AlwaysMatch> { fst: FstRef<'f>, aut: A, @@ -806,6 +827,7 @@ pub struct StreamBuilder<'f, A = AlwaysMatch> { max: Bound, } +#[cfg(feature = "alloc")] impl<'f, A: Automaton> StreamBuilder<'f, A> { fn new(fst: FstRef<'f>, aut: A) -> StreamBuilder<'f, A> { StreamBuilder { @@ -817,30 +839,35 @@ impl<'f, A: Automaton> StreamBuilder<'f, A> { } /// Specify a greater-than-or-equal-to bound. + #[cfg(feature = "alloc")] pub fn ge>(mut self, bound: T) -> StreamBuilder<'f, A> { self.min = Bound::Included(bound.as_ref().to_owned()); self } /// Specify a greater-than bound. + #[cfg(feature = "alloc")] pub fn gt>(mut self, bound: T) -> StreamBuilder<'f, A> { self.min = Bound::Excluded(bound.as_ref().to_owned()); self } /// Specify a less-than-or-equal-to bound. + #[cfg(feature = "alloc")] pub fn le>(mut self, bound: T) -> StreamBuilder<'f, A> { self.max = Bound::Included(bound.as_ref().to_owned()); self } /// Specify a less-than bound. + #[cfg(feature = "alloc")] pub fn lt>(mut self, bound: T) -> StreamBuilder<'f, A> { self.max = Bound::Excluded(bound.as_ref().to_owned()); self } } +#[cfg(feature = "alloc")] impl<'a, 'f, A: Automaton> IntoStreamer<'a> for StreamBuilder<'f, A> { type Item = (&'a [u8], Output); type Into = Stream<'f, A>; @@ -867,6 +894,7 @@ impl<'a, 'f, A: Automaton> IntoStreamer<'a> for StreamBuilder<'f, A> { /// the stream. By default, no filtering is done. /// /// The `'f` lifetime parameter refers to the lifetime of the underlying fst. 
+#[cfg(feature = "alloc")] pub struct StreamWithStateBuilder<'f, A = AlwaysMatch> { fst: FstRef<'f>, aut: A, @@ -874,6 +902,7 @@ pub struct StreamWithStateBuilder<'f, A = AlwaysMatch> { max: Bound, } +#[cfg(feature = "alloc")] impl<'f, A: Automaton> StreamWithStateBuilder<'f, A> { fn new(fst: FstRef<'f>, aut: A) -> StreamWithStateBuilder<'f, A> { StreamWithStateBuilder { @@ -921,6 +950,7 @@ impl<'f, A: Automaton> StreamWithStateBuilder<'f, A> { } } +#[cfg(feature = "alloc")] impl<'a, 'f, A: 'a + Automaton> IntoStreamer<'a> for StreamWithStateBuilder<'f, A> where @@ -935,12 +965,14 @@ where } #[derive(Debug)] +#[cfg(feature = "alloc")] enum Bound { Included(Vec), Excluded(Vec), Unbounded, } +#[cfg(feature = "alloc")] impl Bound { #[inline] fn exceeded_by(&self, inp: &[u8]) -> bool { @@ -975,8 +1007,10 @@ impl Bound { /// the stream. By default, no filtering is done. /// /// The `'f` lifetime parameter refers to the lifetime of the underlying fst. +#[cfg(feature = "alloc")] pub struct Stream<'f, A: Automaton = AlwaysMatch>(StreamWithState<'f, A>); +#[cfg(feature = "alloc")] impl<'f, A: Automaton> Stream<'f, A> { fn new(fst: FstRef<'f>, aut: A, min: Bound, max: Bound) -> Stream<'f, A> { Stream(StreamWithState::new(fst, aut, min, max)) @@ -1044,6 +1078,7 @@ impl<'f, A: Automaton> Stream<'f, A> { } } +#[cfg(feature = "alloc")] impl<'f, 'a, A: Automaton> Streamer<'a> for Stream<'f, A> { type Item = (&'a [u8], Output); @@ -1062,6 +1097,7 @@ impl<'f, 'a, A: Automaton> Streamer<'a> for Stream<'f, A> { /// the stream. By default, no filtering is done. /// /// The `'m` lifetime parameter refers to the lifetime of the underlying map. 
+#[cfg(feature = "alloc")] pub struct StreamWithState<'f, A = AlwaysMatch> where A: Automaton, @@ -1082,6 +1118,7 @@ struct StreamState<'f, S> { aut_state: S, } +#[cfg(feature = "alloc")] impl<'f, A: Automaton> StreamWithState<'f, A> { fn new( fst: FstRef<'f>, @@ -1248,6 +1285,7 @@ impl<'f, A: Automaton> StreamWithState<'f, A> { } } +#[cfg(feature = "alloc")] impl<'a, 'f, A: 'a + Automaton> Streamer<'a> for StreamWithState<'f, A> where A::State: Clone, @@ -1368,7 +1406,7 @@ fn u64_to_usize(n: u64) -> usize { #[inline] #[cfg(not(target_pointer_width = "64"))] fn u64_to_usize(n: u64) -> usize { - if n > std::usize::MAX as u64 { + if n > core::usize::MAX as u64 { panic!( "\ Cannot convert node address {} to a pointer sized variable. If this FST diff --git a/src/raw/node.rs b/src/raw/node.rs index 820b29ea..455996f6 100644 --- a/src/raw/node.rs +++ b/src/raw/node.rs @@ -1,9 +1,12 @@ -use std::cmp; -use std::fmt; +#[cfg(feature = "std")] +use core::cmp; +use core::fmt; +#[cfg(feature = "std")] use std::io; -use std::ops::Range; +use core::ops::Range; use crate::bytes; +#[cfg(feature = "std")] use crate::raw::build::BuilderNode; use crate::raw::common_inputs::{COMMON_INPUTS, COMMON_INPUTS_INV}; use crate::raw::{ @@ -241,6 +244,7 @@ impl<'f> Node<'f> { } } + #[cfg(feature = "std")] fn compile( wtr: W, last_addr: CompiledAddr, @@ -265,6 +269,7 @@ impl<'f> Node<'f> { } } +#[cfg(feature = "std")] impl BuilderNode { pub fn compile_to( &self, @@ -309,6 +314,7 @@ impl State { } impl StateOneTransNext { + #[cfg(feature = "std")] fn compile( mut wtr: W, _: CompiledAddr, @@ -368,6 +374,7 @@ impl StateOneTransNext { } impl StateOneTrans { + #[cfg(feature = "std")] fn compile( mut wtr: W, addr: CompiledAddr, @@ -466,6 +473,7 @@ impl StateOneTrans { } impl StateAnyTrans { + #[cfg(feature = "std")] fn compile( mut wtr: W, addr: CompiledAddr, @@ -824,6 +832,7 @@ fn common_input(idx: u8) -> Option { } #[inline] +#[cfg(feature = "std")] fn pack_delta( wtr: W, node_addr: CompiledAddr, @@ 
-835,6 +844,7 @@ fn pack_delta( } #[inline] +#[cfg(feature = "std")] fn pack_delta_in( wtr: W, node_addr: CompiledAddr, diff --git a/src/raw/ops.rs b/src/raw/ops.rs index 9baf8554..a8215321 100644 --- a/src/raw/ops.rs +++ b/src/raw/ops.rs @@ -1,11 +1,15 @@ -use std::cmp; -use std::collections::BinaryHeap; -use std::iter::FromIterator; +use core::cmp; +#[cfg(feature = "alloc")] +use alloc::{collections::BinaryHeap, boxed::Box}; +#[cfg(feature = "alloc")] +use alloc::{vec, vec::Vec}; +use core::iter::FromIterator; use crate::raw::Output; use crate::stream::{IntoStreamer, Streamer}; /// Permits stream operations to be hetergeneous with respect to streams. +#[cfg(feature = "alloc")] type BoxedStream<'f> = Box Streamer<'a, Item = (&'a [u8], Output)> + 'f>; @@ -41,10 +45,12 @@ pub struct IndexedValue { /// stream. /// /// The `'f` lifetime parameter refers to the lifetime of the underlying set. +#[cfg(feature = "alloc")] pub struct OpBuilder<'f> { streams: Vec>, } +#[cfg(feature = "alloc")] impl<'f> OpBuilder<'f> { /// Create a new set operation builder. #[inline] @@ -168,6 +174,7 @@ impl<'f> OpBuilder<'f> { } } +#[cfg(feature = "alloc")] impl<'f, I, S> Extend for OpBuilder<'f> where I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>, @@ -183,6 +190,7 @@ where } } +#[cfg(feature = "alloc")] impl<'f, I, S> FromIterator for OpBuilder<'f> where I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>, @@ -201,12 +209,14 @@ where /// A stream of set union over multiple fst streams in lexicographic order. /// /// The `'f` lifetime parameter refers to the lifetime of the underlying map. +#[cfg(feature = "alloc")] pub struct Union<'f> { heap: StreamHeap<'f>, outs: Vec, cur_slot: Option, } +#[cfg(feature = "alloc")] impl<'a, 'f> Streamer<'a> for Union<'f> { type Item = (&'a [u8], &'a [IndexedValue]); @@ -235,12 +245,14 @@ impl<'a, 'f> Streamer<'a> for Union<'f> { /// order. /// /// The `'f` lifetime parameter refers to the lifetime of the underlying fst. 
+#[cfg(feature = "alloc")] pub struct Intersection<'f> { heap: StreamHeap<'f>, outs: Vec, cur_slot: Option, } +#[cfg(feature = "alloc")] impl<'a, 'f> Streamer<'a> for Intersection<'f> { type Item = (&'a [u8], &'a [IndexedValue]); @@ -280,6 +292,7 @@ impl<'a, 'f> Streamer<'a> for Intersection<'f> { /// appear in any other streams. /// /// The `'f` lifetime parameter refers to the lifetime of the underlying fst. +#[cfg(feature = "alloc")] pub struct Difference<'f> { set: BoxedStream<'f>, key: Vec, @@ -287,6 +300,7 @@ pub struct Difference<'f> { outs: Vec, } +#[cfg(feature = "alloc")] impl<'a, 'f> Streamer<'a> for Difference<'f> { type Item = (&'a [u8], &'a [IndexedValue]); @@ -320,12 +334,14 @@ impl<'a, 'f> Streamer<'a> for Difference<'f> { /// lexicographic order. /// /// The `'f` lifetime parameter refers to the lifetime of the underlying fst. +#[cfg(feature = "alloc")] pub struct SymmetricDifference<'f> { heap: StreamHeap<'f>, outs: Vec, cur_slot: Option, } +#[cfg(feature = "alloc")] impl<'a, 'f> Streamer<'a> for SymmetricDifference<'f> { type Item = (&'a [u8], &'a [IndexedValue]); @@ -359,11 +375,13 @@ impl<'a, 'f> Streamer<'a> for SymmetricDifference<'f> { } } +#[cfg(feature = "alloc")] struct StreamHeap<'f> { rdrs: Vec>, heap: BinaryHeap, } +#[cfg(feature = "alloc")] impl<'f> StreamHeap<'f> { fn new(streams: Vec>) -> StreamHeap<'f> { let mut u = StreamHeap { rdrs: streams, heap: BinaryHeap::new() }; @@ -410,6 +428,7 @@ impl<'f> StreamHeap<'f> { } } +#[cfg(feature = "alloc")] #[derive(Debug, Eq, PartialEq)] struct Slot { idx: usize, @@ -417,6 +436,7 @@ struct Slot { output: Output, } +#[cfg(feature = "alloc")] impl Slot { fn new(rdr_idx: usize) -> Slot { Slot { @@ -444,6 +464,7 @@ impl Slot { } } +#[cfg(feature = "alloc")] impl PartialOrd for Slot { fn partial_cmp(&self, other: &Slot) -> Option { (&self.input, self.output) @@ -452,6 +473,7 @@ impl PartialOrd for Slot { } } +#[cfg(feature = "alloc")] impl Ord for Slot { fn cmp(&self, other: &Slot) -> 
cmp::Ordering { self.partial_cmp(other).unwrap() diff --git a/src/raw/registry.rs b/src/raw/registry.rs index 3b00a5b0..bc027503 100644 --- a/src/raw/registry.rs +++ b/src/raw/registry.rs @@ -1,7 +1,10 @@ +#[cfg(feature = "alloc")] use crate::raw::build::BuilderNode; use crate::raw::{CompiledAddr, NONE_ADDRESS}; - +#[cfg(feature = "alloc")] +use alloc::{vec::Vec, vec}; #[derive(Debug)] +#[cfg(feature = "alloc")] pub struct Registry { table: Vec, table_size: usize, // number of rows @@ -9,23 +12,27 @@ pub struct Registry { } #[derive(Debug)] +#[cfg(feature = "alloc")] struct RegistryCache<'a> { cells: &'a mut [RegistryCell], } #[derive(Clone, Debug)] +#[cfg(feature = "alloc")] pub struct RegistryCell { addr: CompiledAddr, node: BuilderNode, } #[derive(Debug)] +#[cfg(feature = "alloc")] pub enum RegistryEntry<'a> { Found(CompiledAddr), NotFound(&'a mut RegistryCell), Rejected, } +#[cfg(feature = "alloc")] impl Registry { pub fn new(table_size: usize, mru_size: usize) -> Registry { let empty_cell = RegistryCell::none(); @@ -62,6 +69,7 @@ impl Registry { } } +#[cfg(feature = "alloc")] impl<'a> RegistryCache<'a> { fn entry(mut self, node: &BuilderNode) -> RegistryEntry<'a> { if self.cells.len() == 1 { @@ -112,6 +120,7 @@ impl<'a> RegistryCache<'a> { } } +#[cfg(feature = "alloc")] impl RegistryCell { fn none() -> RegistryCell { RegistryCell { addr: NONE_ADDRESS, node: BuilderNode::default() } diff --git a/src/raw/registry_minimal.rs b/src/raw/registry_minimal.rs index 663b0123..207c6751 100644 --- a/src/raw/registry_minimal.rs +++ b/src/raw/registry_minimal.rs @@ -10,13 +10,14 @@ // expensive SipHasher. 
#![allow(dead_code)] - +#[cfg(feature = "std")] use std::collections::hash_map::{Entry, HashMap}; - +#[cfg(feature = "std")] use crate::raw::build::BuilderNode; use crate::raw::CompiledAddr; #[derive(Debug)] +#[cfg(feature = "std")] pub struct Registry { table: HashMap, } @@ -31,6 +32,7 @@ pub enum RegistryEntry<'a> { #[derive(Clone, Copy, Debug)] pub struct RegistryCell(CompiledAddr); +#[cfg(feature = "std")] impl Registry { pub fn new(table_size: usize, _lru_size: usize) -> Registry { Registry { table: HashMap::with_capacity(table_size) } diff --git a/src/set.rs b/src/set.rs index 3b1b90df..5ed31cfe 100644 --- a/src/set.rs +++ b/src/set.rs @@ -1,6 +1,10 @@ -use std::fmt; +#[cfg(feature = "alloc")] +use core::fmt; +#[cfg(feature = "std")] use std::io; -use std::iter::{self, FromIterator}; +use core::iter::{self, FromIterator}; +#[cfg(feature = "alloc")] +use alloc::{vec::Vec, string::String}; use crate::automaton::{AlwaysMatch, Automaton}; use crate::raw; @@ -29,6 +33,7 @@ use crate::Result; #[derive(Clone)] pub struct Set(raw::Fst); +#[cfg(feature = "alloc")] impl Set> { /// Create a `Set` from an iterator of lexicographically ordered byte /// strings. @@ -39,6 +44,7 @@ impl Set> { /// Note that this is a convenience function to build a set in memory. /// To build a set that streams to an arbitrary `io::Write`, use /// `SetBuilder`. 
+ #[cfg(feature = "std")] pub fn from_iter(iter: I) -> Result>> where T: AsRef<[u8]>, @@ -119,6 +125,7 @@ impl> Set { /// assert_eq!(keys, vec![b"a", b"b", b"c"]); /// ``` #[inline] + #[cfg(feature = "alloc")] pub fn stream(&self) -> Stream<'_> { Stream(self.0.stream()) } @@ -149,6 +156,7 @@ impl> Set { /// assert_eq!(keys, vec![b"b", b"c", b"d"]); /// ``` #[inline] + #[cfg(feature = "alloc")] pub fn range(&self) -> StreamBuilder<'_> { StreamBuilder(self.0.range()) } @@ -189,6 +197,7 @@ impl> Set { /// Ok(()) /// } /// ``` + #[cfg(feature = "alloc")] pub fn search(&self, aut: A) -> StreamBuilder<'_, A> { StreamBuilder(self.0.search(aut)) } @@ -242,6 +251,7 @@ fn example() -> Result<(), Box> { ``` "## )] + #[cfg(feature = "alloc")] pub fn search_with_state( &self, aut: A, @@ -284,6 +294,7 @@ fn example() -> Result<(), Box> { /// assert_eq!(keys, vec![b"a", b"b", b"c", b"y", b"z"]); /// ``` #[inline] + #[cfg(feature = "alloc")] pub fn op(&self) -> OpBuilder<'_> { OpBuilder::new().add(self) } @@ -307,6 +318,7 @@ fn example() -> Result<(), Box> { /// /// assert_eq!(set1.is_disjoint(&set3), false); /// ``` + #[cfg(feature = "alloc")] pub fn is_disjoint<'f, I, S>(&self, stream: I) -> bool where I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>, @@ -334,6 +346,7 @@ fn example() -> Result<(), Box> { /// assert_eq!(set1.is_subset(&set3), false); /// assert_eq!(set3.is_subset(&set1), true); /// ``` + #[cfg(feature = "alloc")] pub fn is_subset<'f, I, S>(&self, stream: I) -> bool where I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>, @@ -361,6 +374,7 @@ fn example() -> Result<(), Box> { /// assert_eq!(set1.is_superset(&set3), true); /// assert_eq!(set3.is_superset(&set1), false); /// ``` + #[cfg(feature = "alloc")] pub fn is_superset<'f, I, S>(&self, stream: I) -> bool where I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>, @@ -410,6 +424,7 @@ fn example() -> Result<(), Box> { } } +#[cfg(feature = "std")] impl Default for Set> { #[inline] fn default() -> Set> 
{ @@ -417,6 +432,7 @@ impl Default for Set> { } } +#[cfg(feature = "alloc")] impl> fmt::Debug for Set { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "Set([")?; @@ -441,6 +457,7 @@ impl> AsRef> for Set { } } +#[cfg(feature = "alloc")] impl<'s, 'a, D: AsRef<[u8]>> IntoStreamer<'a> for &'s Set { type Item = &'a [u8]; type Into = Stream<'s>; @@ -548,8 +565,10 @@ impl> From> for Set { /// "bruce".as_bytes(), "clarence".as_bytes(), "stevie".as_bytes(), /// ]); /// ``` +#[cfg(feature = "alloc")] pub struct SetBuilder(raw::Builder); +#[cfg(feature = "std")] impl SetBuilder> { /// Create a builder that builds a set in memory. #[inline] @@ -564,6 +583,7 @@ impl SetBuilder> { } } +#[cfg(feature = "std")] impl SetBuilder { /// Create a builder that builds a set by writing it to `wtr` in a /// streaming fashion. @@ -637,10 +657,12 @@ impl SetBuilder { /// the stream. By default, no filtering is done. /// /// The `'s` lifetime parameter refers to the lifetime of the underlying set. +#[cfg(feature = "alloc")] pub struct Stream<'s, A = AlwaysMatch>(raw::Stream<'s, A>) where A: Automaton; +#[cfg(feature = "alloc")] impl<'s, A: Automaton> Stream<'s, A> { /// Convert this stream into a vector of Unicode strings. /// @@ -660,6 +682,7 @@ impl<'s, A: Automaton> Stream<'s, A> { } } +#[cfg(feature = "alloc")] impl<'a, 's, A: Automaton> Streamer<'a> for Stream<'s, A> { type Item = &'a [u8]; @@ -677,10 +700,12 @@ impl<'a, 's, A: Automaton> Streamer<'a> for Stream<'s, A> { /// the stream. By default, no filtering is done. /// /// The `'m` lifetime parameter refers to the lifetime of the underlying set. +#[cfg(feature = "alloc")] pub struct StreamWithState<'m, A = AlwaysMatch>(raw::StreamWithState<'m, A>) where A: Automaton; +#[cfg(feature = "alloc")] impl<'a, 'm, A: 'a + Automaton> Streamer<'a> for StreamWithState<'m, A> where A::State: Clone, @@ -704,8 +729,10 @@ where /// the stream. By default, no filtering is done. 
/// /// The `'s` lifetime parameter refers to the lifetime of the underlying set. +#[cfg(feature = "alloc")] pub struct StreamBuilder<'s, A = AlwaysMatch>(raw::StreamBuilder<'s, A>); +#[cfg(feature = "alloc")] impl<'s, A: Automaton> StreamBuilder<'s, A> { /// Specify a greater-than-or-equal-to bound. pub fn ge>(self, bound: T) -> StreamBuilder<'s, A> { @@ -728,6 +755,7 @@ impl<'s, A: Automaton> StreamBuilder<'s, A> { } } +#[cfg(feature = "alloc")] impl<'s, 'a, A: Automaton> IntoStreamer<'a> for StreamBuilder<'s, A> { type Item = &'a [u8]; type Into = Stream<'s, A>; @@ -754,10 +782,12 @@ impl<'s, 'a, A: Automaton> IntoStreamer<'a> for StreamBuilder<'s, A> { /// the stream. By default, no filtering is done. /// /// The `'s` lifetime parameter refers to the lifetime of the underlying set. +#[cfg(feature = "alloc")] pub struct StreamWithStateBuilder<'s, A = AlwaysMatch>( raw::StreamWithStateBuilder<'s, A>, ); +#[cfg(feature = "alloc")] impl<'s, A: Automaton> StreamWithStateBuilder<'s, A> { /// Specify a greater-than-or-equal-to bound. pub fn ge>( @@ -792,6 +822,7 @@ impl<'s, A: Automaton> StreamWithStateBuilder<'s, A> { } } +#[cfg(feature = "alloc")] impl<'s, 'a, A: 'a + Automaton> IntoStreamer<'a> for StreamWithStateBuilder<'s, A> where @@ -819,8 +850,10 @@ where /// stream. /// /// The `'s` lifetime parameter refers to the lifetime of the underlying set. +#[cfg(feature = "alloc")] pub struct OpBuilder<'s>(raw::OpBuilder<'s>); +#[cfg(feature = "alloc")] impl<'s> OpBuilder<'s> { /// Create a new set operation builder. #[inline] @@ -958,6 +991,7 @@ impl<'s> OpBuilder<'s> { } } +#[cfg(feature = "alloc")] impl<'f, I, S> Extend for OpBuilder<'f> where I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>, @@ -973,6 +1007,7 @@ where } } +#[cfg(feature = "alloc")] impl<'f, I, S> FromIterator for OpBuilder<'f> where I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>, @@ -991,8 +1026,10 @@ where /// A stream of set union over multiple streams in lexicographic order. 
/// /// The `'s` lifetime parameter refers to the lifetime of the underlying set. +#[cfg(feature = "alloc")] pub struct Union<'s>(raw::Union<'s>); +#[cfg(feature = "alloc")] impl<'a, 's> Streamer<'a> for Union<'s> { type Item = &'a [u8]; @@ -1005,8 +1042,10 @@ impl<'a, 's> Streamer<'a> for Union<'s> { /// A stream of set intersection over multiple streams in lexicographic order. /// /// The `'s` lifetime parameter refers to the lifetime of the underlying set. +#[cfg(feature = "alloc")] pub struct Intersection<'s>(raw::Intersection<'s>); +#[cfg(feature = "alloc")] impl<'a, 's> Streamer<'a> for Intersection<'s> { type Item = &'a [u8]; @@ -1023,8 +1062,10 @@ impl<'a, 's> Streamer<'a> for Intersection<'s> { /// appear in any other streams. /// /// The `'s` lifetime parameter refers to the lifetime of the underlying set. +#[cfg(feature = "alloc")] pub struct Difference<'s>(raw::Difference<'s>); +#[cfg(feature = "alloc")] impl<'a, 's> Streamer<'a> for Difference<'s> { type Item = &'a [u8]; @@ -1038,8 +1079,10 @@ impl<'a, 's> Streamer<'a> for Difference<'s> { /// order. /// /// The `'s` lifetime parameter refers to the lifetime of the underlying set. +#[cfg(feature = "alloc")] pub struct SymmetricDifference<'s>(raw::SymmetricDifference<'s>); +#[cfg(feature = "alloc")] impl<'a, 's> Streamer<'a> for SymmetricDifference<'s> { type Item = &'a [u8]; From 03a3a3cf20281e1214c76ada687a3b0d97184088 Mon Sep 17 00:00:00 2001 From: Jonas Kruckenberg <118265418+CrabNejonas@users.noreply.github.com> Date: Wed, 1 Nov 2023 15:31:19 +0100 Subject: [PATCH 02/28] Update Cargo.toml --- Cargo.toml | 3 --- 1 file changed, 3 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 02d51418..4ab59d2e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,9 +24,6 @@ levenshtein = ["utf8-ranges"] std = ["alloc"] alloc = [] -[patch.crates-io] -fst = { path = "." 
} - [dependencies] utf8-ranges = { version = "1.0.4", optional = true } From 891e8fc65ab661a6d2417e01f3d7ecf8ba929530 Mon Sep 17 00:00:00 2001 From: Jonas Kruckenberg <118265418+CrabNejonas@users.noreply.github.com> Date: Wed, 17 Jan 2024 17:52:00 +0100 Subject: [PATCH 03/28] remove .idea files --- .gitignore | 1 + .idea/fst.iml | 14 ----- .idea/modules.xml | 8 --- .idea/vcs.xml | 6 --- .idea/workspace.xml | 126 -------------------------------------------- 5 files changed, 1 insertion(+), 154 deletions(-) delete mode 100644 .idea/fst.iml delete mode 100644 .idea/modules.xml delete mode 100644 .idea/vcs.xml delete mode 100644 .idea/workspace.xml diff --git a/.gitignore b/.gitignore index 579d99f2..93dfb316 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,4 @@ words dict test months +.idea \ No newline at end of file diff --git a/.idea/fst.iml b/.idea/fst.iml deleted file mode 100644 index e4ade7cf..00000000 --- a/.idea/fst.iml +++ /dev/null @@ -1,14 +0,0 @@ - - - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml deleted file mode 100644 index 46ed0723..00000000 --- a/.idea/modules.xml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - - - - \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml deleted file mode 100644 index 35eb1ddf..00000000 --- a/.idea/vcs.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - - \ No newline at end of file diff --git a/.idea/workspace.xml b/.idea/workspace.xml deleted file mode 100644 index 613a07ed..00000000 --- a/.idea/workspace.xml +++ /dev/null @@ -1,126 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - { - "associatedIndex": 6 -} - - - - - - - - - - - - - - - - - 1698689393828 - - - - - - \ No newline at end of file From 0080cec43cfbdd4f99ec6540321ec59a4603827f Mon Sep 17 00:00:00 2001 From: Jonas Kruckenberg <118265418+CrabNejonas@users.noreply.github.com> Date: Wed, 17 Jan 2024 18:07:59 +0100 Subject: [PATCH 04/28] Update README.md 
--- README.md | 51 +++++++++++++++++++++------------------------------ 1 file changed, 21 insertions(+), 30 deletions(-) diff --git a/README.md b/README.md index 308907fa..ce58e187 100644 --- a/README.md +++ b/README.md @@ -1,33 +1,13 @@ -fst +fst no-std mode === -This crate provides a fast implementation of ordered sets and maps using finite -state machines. In particular, it makes use of finite state transducers to map -keys to values as the machine is executed. Using finite state machines as data -structures enables us to store keys in a compact format that is also easily -searchable. For example, this crate leverages memory maps to make range queries -very fast. -Check out my blog post -[Index 1,600,000,000 Keys with Automata and -Rust](https://blog.burntsushi.net/transducers/) -for extensive background, examples and experiments. - -[![Build status](https://github.com/BurntSushi/fst/workflows/ci/badge.svg)](https://github.com/BurntSushi/fst/actions) -[![](https://meritbadge.herokuapp.com/fst)](https://crates.io/crates/fst) - -Dual-licensed under MIT or the [UNLICENSE](https://unlicense.org/). +This is a fork of [fst](https://github.com/BurntSushi/fst) adding support for `no_std` targets (see [`no_std` usage](#no-std-usage) for details). +If you're unsure whether to use this fork or the original one: Just use the original, chances are that's more up-to-date. ### Documentation -https://docs.rs/fst - -The -[`regex-automata`](https://docs.rs/regex-automata) -crate provides implementations of the `fst::Automata` trait when its -`transducer` feature is enabled. This permits using DFAs compiled by -`regex-automata` to search finite state transducers produced by this crate. 
- +https://docs.rs/fst-no-std ### Installation @@ -35,19 +15,18 @@ Simply add a corresponding entry to your `Cargo.toml` dependency list: ```toml,ignore [dependencies] -fst = "0.4" +fst-no-std = "0.4" ``` - ### Example This example demonstrates building a set in memory and executing a fuzzy query -against it. You'll need `fst = "0.4"` with the `levenshtein` feature enabled in +against it. You'll need `fst-no-std = "0.4"` with the `levenshtein` feature enabled in your `Cargo.toml`. ```rust -use fst::{IntoStreamer, Set}; -use fst::automaton::Levenshtein; +use fst_no_std::{IntoStreamer, Set}; +use fst_no_std::automaton::Levenshtein; fn main() -> Result<(), Box> { // A convenient way to create sets in memory. @@ -68,9 +47,21 @@ fn main() -> Result<(), Box> { Check out the documentation for a lot more examples! - ### Cargo features +* `std` - **Enabled** by default. Adds features that depend on the standard library. +* `alloc` - **Enabled** by default. Adds features that depend on `alloc`. * `levenshtein` - **Disabled** by default. This adds the `Levenshtein` automaton to the `automaton` sub-module. This includes an additional dependency on `utf8-ranges`. + +### `no_std` Usage + +You can use this crate in `no_std` environments by disabling default features, like so: + +```toml,ignore +[dependencies] +fst-no-std = { version = "0.4", default-features = false } +``` + +This way `fst-no-std` will not depend on the standard library and not even allocate (!) at the cost of being rather kneecapped: You can not construct FSTs and the available querying features are limited to simple lookups. You can optionally enable the `alloc` feature which adds a dependency on the `alloc` crate (i.e. you will need a global allocator) but it enables all querying features.
From 5303d43bfea03fa462b66a351befef5914f1f7d0 Mon Sep 17 00:00:00 2001 From: Jonas Kruckenberg <118265418+CrabNejonas@users.noreply.github.com> Date: Wed, 17 Jan 2024 18:10:13 +0100 Subject: [PATCH 05/28] Update README.md --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index ce58e187..9ceba51f 100644 --- a/README.md +++ b/README.md @@ -65,3 +65,9 @@ fst-no-std = { version = "0.4", default-features = false } ``` This way `fst-no-std` will not depend on the standard library and not even allocate (!) at the cost of being rather kneecapped: You can not construct FSTs and the available querying features are limited to simple lookups. You can optionally enable the `alloc` feature which adds a dependency on the `alloc` crate (i.e. you will need a global allocator) but it enables all querying features. + +#### License + + +Licensed under the MIT license. + From f264d2153cb7d76f32aea952e9f41041b707b3e3 Mon Sep 17 00:00:00 2001 From: Jonas Kruckenberg <118265418+CrabNejonas@users.noreply.github.com> Date: Wed, 17 Jan 2024 18:16:53 +0100 Subject: [PATCH 06/28] Update Cargo.toml --- Cargo.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 4ab59d2e..d9981bfd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "fst" +name = "fst-no-std" version = "0.4.7" #:version authors = ["Andrew Gallant "] description = """ @@ -15,7 +15,7 @@ license = "Unlicense/MIT" edition = "2018" [workspace] -members = ["bench", "fst-bin"] +members = ["bench"] exclude = ["fst-levenshtein", "fst-regex"] [features] From 1801a2f2c66e558672e795e141bb9767220b994f Mon Sep 17 00:00:00 2001 From: Jonas Kruckenberg <118265418+CrabNejonas@users.noreply.github.com> Date: Wed, 17 Jan 2024 18:17:05 +0100 Subject: [PATCH 07/28] Delete fst-bin directory --- fst-bin/Cargo.toml | 34 ---- fst-bin/src/app.rs | 417 -------------------------------------- fst-bin/src/cmd/csv.rs | 102 ----------
fst-bin/src/cmd/dot.rs | 94 --------- fst-bin/src/cmd/dupes.rs | 88 -------- fst-bin/src/cmd/fuzzy.rs | 69 ------- fst-bin/src/cmd/grep.rs | 68 ------- fst-bin/src/cmd/map.rs | 106 ---------- fst-bin/src/cmd/mod.rs | 12 -- fst-bin/src/cmd/node.rs | 31 --- fst-bin/src/cmd/range.rs | 43 ---- fst-bin/src/cmd/rust.rs | 62 ------ fst-bin/src/cmd/set.rs | 98 --------- fst-bin/src/cmd/union.rs | 51 ----- fst-bin/src/cmd/verify.rs | 24 --- fst-bin/src/main.rs | 31 --- fst-bin/src/merge.rs | 276 ------------------------- fst-bin/src/util.rs | 188 ----------------- 18 files changed, 1794 deletions(-) delete mode 100644 fst-bin/Cargo.toml delete mode 100644 fst-bin/src/app.rs delete mode 100644 fst-bin/src/cmd/csv.rs delete mode 100644 fst-bin/src/cmd/dot.rs delete mode 100644 fst-bin/src/cmd/dupes.rs delete mode 100644 fst-bin/src/cmd/fuzzy.rs delete mode 100644 fst-bin/src/cmd/grep.rs delete mode 100644 fst-bin/src/cmd/map.rs delete mode 100644 fst-bin/src/cmd/mod.rs delete mode 100644 fst-bin/src/cmd/node.rs delete mode 100644 fst-bin/src/cmd/range.rs delete mode 100644 fst-bin/src/cmd/rust.rs delete mode 100644 fst-bin/src/cmd/set.rs delete mode 100644 fst-bin/src/cmd/union.rs delete mode 100644 fst-bin/src/cmd/verify.rs delete mode 100644 fst-bin/src/main.rs delete mode 100644 fst-bin/src/merge.rs delete mode 100644 fst-bin/src/util.rs diff --git a/fst-bin/Cargo.toml b/fst-bin/Cargo.toml deleted file mode 100644 index 9c656ed6..00000000 --- a/fst-bin/Cargo.toml +++ /dev/null @@ -1,34 +0,0 @@ -[package] -name = "fst-bin" -version = "0.4.3" #:version -authors = ["Andrew Gallant "] -description = """ -A command line tool for using finite state transducers to compactly represents -sets or maps of many strings (> 1 billion is possible). The command line tool -exposes functionality to search FSTs using regular expressions, Levenshtein -automata and range queries. 
-""" -documentation = "https://docs.rs/fst" -homepage = "https://github.com/BurntSushi/fst" -repository = "https://github.com/BurntSushi/fst" -keywords = ["search", "information", "retrieval", "dictionary", "map"] -license = "Unlicense/MIT" -edition = "2018" - -[[bin]] -name = "fst" -path = "src/main.rs" -doc = false - -[dependencies] -anyhow = "1.0.26" -bit-set = "0.5.3" -bstr = { version = "1.3.0", features = ["serde"] } -clap = { version = "2.33.0", default-features = false } -crossbeam-channel = "0.5.6" -csv = "1.2.0" -fst = { version = "0.4.6", features = ["levenshtein"] } -memmap2 = "0.5.10" -regex-automata = { version = "0.1.9", features = ["transducer"] } -serde = { version = "1.0.104", features = ["derive"] } -tempfile = "3.1.0" diff --git a/fst-bin/src/app.rs b/fst-bin/src/app.rs deleted file mode 100644 index f8ecb4c8..00000000 --- a/fst-bin/src/app.rs +++ /dev/null @@ -1,417 +0,0 @@ -const ABOUT: &str = "\ -A command line tool for building, searching and inspecting FSTs. -"; - -const ABOUT_CSV: &str = "\ -Emit information in CSV format about the transducer. - -If is not set, then CSV data is emitted to stdout. -"; - -const ABOUT_DOT: &str = "\ -Emit this transducer in the \"dot\" format. - -If is not set, then the \"dot\" description is emitted to stdout. - -Generally, usage of this command should look like this: - - $ fst dot your-transducer.fst | dot -Tpng > your-transducer.png - $ $YOUR_FAVORITE_IMAGE_VIEWER your-transducer.png - -Where 'dot' is a command line utility that is part of graphviz. - -If your transducer contains output values, then they are shown as labels on -transitions. Zero output values are omitted. -"; - -const ABOUT_DUPES: &str = "\ -A simple way to show duplicate nodes. - -This is meant to be a diagnostic tool to view duplicate nodes in the -transducer. Every duplicate node represents a missed opportunity for more -compression. A minimal transducer should have precisely zero duplicate nodes. 
- -WARNING: This stores all nodes in the transducer in memory, decompressed. This -may be expensive in both time and space depending on the size of your -transducer. - -If is omitted, then diagnostic data is emitted to stdout. -"; - -const ABOUT_FUZZY: &str = "\ -Issues a fuzzy query against the given transducer. - -A fuzzy query returns all search results within a particular edit distance -of the query given. - -WARNING: This works by building a Levenshtein automaton, which is currently -rather expensive (in time and space) with a big edit distance. This will be -improved in the future. -"; - -const ABOUT_GREP: &str = "\ -Searches a transducer with a regular expression. - -WARNING: This works by building a regular expression automaton, which can be -quite expensive depending on how big the regex is. If this becomes a problem, -consider disabling Unicode support in the regex via '(?-u)'. -"; - -const ABOUT_MAP: &str = "\ -Creates an ordered map backed by a finite state transducer. - -The input to this command should be a CSV file with exactly two columns and no -headers. The first column should be the key and the second column should be a -value that can be interpreted as an unsigned 64 bit integer. - -If your input is already sorted, then pass the --sorted flag to make -construction much faster. If you use --sorted and the data is not sorted, -then this will return an error when it sees an out-of-order key. -"; - -const ABOUT_NODE: &str = "\ -Shows a single node from the transducer. - -The input to this command is the node's address. An address may be found either -from debugging a transducer in code, or from the output of the 'fst csv' -command. - -If the address does not point to a valid node, then the executable may panic or -abort without ceremony. -"; - -const ABOUT_RANGE: &str = "\ -Issues a range query against the given transducer. - -A range query returns all search results within a particular. 
- -If neither the start or the end of the range is specified, then all entries -in the transducer are shown. -"; - -const ABOUT_RUST: &str = "\ -Emit Rust source code for the given FST. - -This reads the FST given and emits it as Rust source code with one -constant defined: - - {NAME}_BYTES - -And a `lazy_static!` ref for: - - {NAME} - -Where {NAME} is taken from the name given as an argument. - -The latter definition corresponds to calling `Fst::new({NAME}_BYTES)`. -This makes it possible to trivially use pre-built FSTs in your program. -"; - -const ABOUT_SET: &str = "\ -Creates an ordered set backed by a finite state transducer. - -The input to this command should be one or more files with one key per line. - -If your input is already sorted, then pass the --sorted flag to make -construction much faster. If you use --sorted and the data is not sorted, -then this will return an error when it sees an out-of-order key. -"; - -const ABOUT_UNION: &str = "\ -Unions all of the transducer inputs into a single transducer. - -Any output values are dropped. Stated differently, the resulting transducer is -always a set. -"; - -const ABOUT_VERIFY: &str = "\ -Performs verification on the FST to check its integrity. This works by -computing a checksum of the FST's underlying data and comparing it to an -expected checksum. If the checksums do not match, then it's likely that the -FST is corrupt in some fashion and must be re-generated. - -This will also return an error if this command is called on an FST without a -checksum, such as all FSTs generated prior to 'fst 0.4'. All FSTs generated -at or after 'fst 0.4' have checksums. 
-"; - -pub fn app() -> clap::App<'static, 'static> { - let cmd = |name, about| { - clap::SubCommand::with_name(name) - .author(clap::crate_authors!()) - .version(clap::crate_version!()) - .about(about) - }; - let pos = |name| clap::Arg::with_name(name); - let flag = |name| clap::Arg::with_name(name).long(name); - - let csv_arg_input = pos("input") - .required(true) - .help("The FST to extract information from."); - let csv_arg_output = pos("output").help( - "The CSV file to write information to. \ - When absent, print to stdout.", - ); - let csv = cmd("csv", ABOUT_CSV) - .subcommand( - cmd("edges", "Emit information about edges in an FST.") - .arg(csv_arg_input.clone()) - .arg(csv_arg_output.clone()), - ) - .subcommand( - cmd("nodes", "Emit information about nodes in an FST.") - .arg(csv_arg_input.clone()) - .arg(csv_arg_output.clone()), - ); - - let dot = cmd("dot", ABOUT_DOT) - .arg(pos("input").required(true).help("The FST to visualize.")) - .arg(pos("output").help( - "An optional file path to write Dot output to. \ - When empty, output is written to stdout.", - )) - .arg(flag("state-names").help( - " When set, states will be labeled with an arbitrary number.", - )); - - let dupes = - cmd("dupes", ABOUT_DUPES) - .arg(pos("input").required(true).help("The FST to query.")) - .arg(pos("output").help( - "An optional file path to write output to. \ - When empty, output is written to stdout.", - )) - .arg( - flag("limit") - .default_value("10") - .help("Show this many duplicate nodes."), - ) - .arg(flag("min").default_value("20").help( - "Only show duplicates nodes with this many reoccurrences.", - )); - - let fuzzy = cmd("fuzzy", ABOUT_FUZZY) - .arg(pos("input").required(true).help("The FST to query.")) - .arg(pos("query").required(true).help("The fuzzy query.")) - .arg(flag("distance").short("d").default_value("1").help( - "All terms in the FST within this distance are shown. 
The \ - distance is measured in the number of character insertions, \ - deletions and substitutions required to transform the query \ - to a particular term. A \"character\" in this context refers \ - to a single Unicode codepoint.", - )) - .arg( - flag("prefix") - .short("p") - .help("When set, accepts prefixes of the fuzzy query."), - ) - .arg( - flag("outputs") - .short("o") - .help("When set, output values are shown as CSV data."), - ) - .arg( - flag("start") - .short("s") - .help("Only show results greater than or equal to this."), - ) - .arg( - flag("end") - .short("e") - .help("Only show results less than or equal to this."), - ); - - let grep = cmd("grep", ABOUT_GREP) - .arg(pos("input").required(true).help("The FST to query.")) - .arg(pos("regex").required(true).help("The regex.")) - .arg( - flag("outputs") - .short("o") - .help("When set, output values are shown as CSV data."), - ) - .arg( - flag("start") - .short("s") - .help("Only show results greater than or equal to this."), - ) - .arg( - flag("end") - .short("e") - .help("Only show results less than or equal to this."), - ); - - let map = cmd("map", ABOUT_MAP) - .arg( - pos("input") - .required(true) - .multiple(true) - .help("A file containing a key per line."), - ) - .arg( - pos("output") - .required(true) - .help("The destination file path to write the FST."), - ) - .arg(flag("force").help( - "Overwrites the output if the destination file already exists.", - )) - .arg(flag("sorted").help( - "Set this if the input data is already lexicographically \ - sorted. This will make construction much faster. Note that \ - when this is set, most of the other flags (like --fd-limit \ - and --batch-size) are not relevant.", - )) - .arg(flag("max").help( - "When building an FST from unsorted data, this merges output \ - values by taking the maximum. The default is to sum them.", - )) - .arg(flag("min").help( - "When building an FST from unsorted data, this merges output \ - values by taking the minimum. 
The default is to sum them.", - )) - .arg(flag("fd-limit").default_value("15").help( - "The maximum number of file descriptors to have open in a \ - single worker thread.", - )) - .arg(flag("batch-size").default_value("100000").help( - "The number of keys to collect in each batch. N.B. This is the \ - primary factor in how much memory this process uses.", - )) - .arg(flag("threads").default_value("0").help( - "The number of simultaneous workers to run. The default of 0 \ - will use the number of logical CPUs reported by your system.", - )) - .arg(flag("tmp-dir").help( - "A temporary directory used to store intermediate transducers. \ - This defaults to the default temporary directory reported by \ - your system.", - )) - .arg(flag("keep-tmp-dir").help( - "Does not delete the temporary directory. Useful for debugging.", - )); - - let node = cmd("node", ABOUT_NODE) - .arg(pos("input").required(true).help("The FST to inspect.")) - .arg( - pos("node-address") - .required(true) - .help("The address of the node to print."), - ); - - let range = cmd("range", ABOUT_RANGE) - .arg( - pos("input") - .required(true) - .help("The FST to run a range query against."), - ) - .arg( - flag("outputs") - .short("o") - .help("When set, output values are shown as CSV data."), - ) - .arg( - flag("start") - .short("s") - .takes_value(true) - .help("Only show results greater than or equal to this."), - ) - .arg( - flag("end") - .short("e") - .takes_value(true) - .help("Only show results less than or equal to this."), - ); - - let rust = cmd("rust", ABOUT_RUST) - .arg( - pos("input") - .required(true) - .help("The FST to generate Rust code for."), - ) - .arg( - pos("name") - .required(true) - .help("The name of the FST to use in the Rust source code."), - ); - - let set = cmd("set", ABOUT_SET) - .arg( - pos("input") - .required(true) - .multiple(true) - .help("One or more files containing a key per line."), - ) - .arg( - pos("output") - .required(true) - .help("The destination file path to 
write the FST."), - ) - .arg(flag("force").help( - "Overwrites the output if the destination file already exists.", - )) - .arg(flag("sorted").help( - "Set this if the input data is already lexicographically \ - sorted. This will make construction much faster. Note that \ - when this is set, most of the other flags (like --fd-limit \ - and --batch-size) are not relevant.", - )) - .arg(flag("fd-limit").default_value("15").help( - "The maximum number of file descriptors to have open in a \ - single worker thread.", - )) - .arg(flag("batch-size").default_value("100000").help( - "The number of keys to collect in each batch. N.B. This is the \ - primary factor in how much memory this process uses.", - )) - .arg(flag("threads").default_value("0").help( - "The number of simultaneous workers to run. The default of 0 \ - will use the number of logical CPUs reported by your system.", - )) - .arg(flag("tmp-dir").help( - "A temporary directory used to store intermediate transducers. \ - This defaults to the default temporary directory reported by \ - your system.", - )) - .arg(flag("keep-tmp-dir").help( - "Does not delete the temporary directory. 
Useful for debugging.", - )); - - let union = cmd("union", ABOUT_UNION) - .arg( - pos("input") - .required(true) - .multiple(true) - .help("One or more files containing a key per line."), - ) - .arg( - pos("output") - .required(true) - .help("The destination file path to write the FST."), - ) - .arg(flag("force").help( - "Overwrites the output if the destination file already exists.", - )); - - let verify = cmd("verify", ABOUT_VERIFY).arg( - pos("input").required(true).multiple(true).help("The FST to verify."), - ); - - clap::App::new("fst") - .author(clap::crate_authors!()) - .version(clap::crate_version!()) - .about(ABOUT) - .max_term_width(100) - .setting(clap::AppSettings::UnifiedHelpMessage) - .subcommand(csv) - .subcommand(dot) - .subcommand(dupes) - .subcommand(fuzzy) - .subcommand(grep) - .subcommand(map) - .subcommand(node) - .subcommand(range) - .subcommand(rust) - .subcommand(set) - .subcommand(union) - .subcommand(verify) -} diff --git a/fst-bin/src/cmd/csv.rs b/fst-bin/src/cmd/csv.rs deleted file mode 100644 index 3d231951..00000000 --- a/fst-bin/src/cmd/csv.rs +++ /dev/null @@ -1,102 +0,0 @@ -use std::path::PathBuf; - -use bit_set::BitSet; -use csv; - -use crate::util; -use crate::Error; - -pub fn run(matches: &clap::ArgMatches) -> Result<(), Error> { - Args::new(matches).and_then(|args| args.run()) -} - -#[derive(Debug)] -struct Args { - input: PathBuf, - output: Option, - which: Which, -} - -#[derive(Debug)] -enum Which { - Edges, - Nodes, -} - -impl Args { - fn new(m: &clap::ArgMatches) -> Result { - let (which, m) = match m.subcommand() { - ("edges", Some(m)) => (Which::Edges, m), - ("nodes", Some(m)) => (Which::Nodes, m), - (unknown, _) => { - anyhow::bail!("unrecognized csv sub-command: {}", unknown) - } - }; - Ok(Args { - input: m.value_of_os("input").map(PathBuf::from).unwrap(), - output: m.value_of_os("output").map(PathBuf::from), - which, - }) - } - - fn run(&self) -> Result<(), Error> { - let wtr = util::get_writer(self.output.as_ref())?; - 
let mut wtr = csv::Writer::from_writer(wtr); - - let fst = unsafe { util::mmap_fst(&self.input)? }; - let mut set = BitSet::with_capacity(fst.len()); - - match self.which { - Which::Edges => { - wtr.serialize(("addr_in", "addr_out", "input", "output"))?; - let mut stack = vec![fst.root().addr()]; - set.insert(fst.root().addr()); - while let Some(addr) = stack.pop() { - for t in fst.node(addr).transitions() { - if !set.contains(t.addr) { - stack.push(t.addr); - set.insert(t.addr); - } - wtr.serialize(( - addr, - t.addr, - t.inp as char, - t.out.value(), - ))?; - } - } - } - Which::Nodes => { - wtr.serialize(( - "addr", - "state", - "size", - "transitions", - "final", - "final_output", - ))?; - let mut stack = vec![fst.root().addr()]; - set.insert(fst.root().addr()); - while let Some(addr) = stack.pop() { - let node = fst.node(addr); - for t in node.transitions() { - if !set.contains(t.addr) { - stack.push(t.addr); - set.insert(t.addr); - } - } - let row = &[ - node.addr().to_string(), - node.state().to_string(), - node.as_slice().len().to_string(), - node.len().to_string(), - node.is_final().to_string(), - node.final_output().value().to_string(), - ]; - wtr.write_record(row.iter())?; - } - } - } - wtr.flush().map_err(From::from) - } -} diff --git a/fst-bin/src/cmd/dot.rs b/fst-bin/src/cmd/dot.rs deleted file mode 100644 index d131bb45..00000000 --- a/fst-bin/src/cmd/dot.rs +++ /dev/null @@ -1,94 +0,0 @@ -use std::io::Write; -use std::path::PathBuf; - -use bit_set::BitSet; - -use crate::util; -use crate::Error; - -pub fn run(matches: &clap::ArgMatches) -> Result<(), Error> { - Args::new(matches).and_then(|args| args.run()) -} - -#[derive(Debug)] -struct Args { - input: PathBuf, - output: Option, - state_names: bool, -} - -impl Args { - fn new(m: &clap::ArgMatches) -> Result { - Ok(Args { - input: m.value_of_os("input").map(PathBuf::from).unwrap(), - output: m.value_of_os("output").map(PathBuf::from), - state_names: m.is_present("state-names"), - }) - } - - fn 
run(&self) -> Result<(), Error> { - let mut wtr = util::get_buf_writer(self.output.as_ref())?; - let fst = unsafe { util::mmap_fst(&self.input)? }; - let mut set = BitSet::with_capacity(fst.len()); - - let mut stack = vec![fst.root().addr()]; - - writeln!( - wtr, - r#" - digraph automaton {{ - labelloc="l"; - labeljust="l"; - rankdir="LR"; - "# - ) - .unwrap(); - let mut state_num = 0; - while let Some(addr) = stack.pop() { - if set.contains(addr) { - continue; - } - set.insert(addr); - - let node = fst.node(addr); - writeln!(wtr, "{}", self.dot_state(&node, state_num, addr)) - .unwrap(); - for t in node.transitions() { - stack.push(t.addr); - let out = if t.out.value() == 0 { - "".to_owned() - } else { - format!("/{}", t.out.value().to_string()) - }; - writeln!( - wtr, - " {} -> {} [label=\"{}{}\"];", - addr, - t.addr, - util::escape_input(t.inp), - out - ) - .unwrap(); - } - state_num += 1; - } - writeln!(wtr, "}}").unwrap(); - wtr.flush()?; - Ok(()) - } - - fn dot_state( - &self, - node: &fst::raw::Node, - i: usize, - addr: fst::raw::CompiledAddr, - ) -> String { - let label = - if self.state_names { i.to_string() } else { "".to_owned() }; - if node.is_final() { - format!(" {} [label=\"{}\",peripheries=2];", addr, label) - } else { - format!(" {} [label=\"{}\"];", addr, label) - } - } -} diff --git a/fst-bin/src/cmd/dupes.rs b/fst-bin/src/cmd/dupes.rs deleted file mode 100644 index 054761d6..00000000 --- a/fst-bin/src/cmd/dupes.rs +++ /dev/null @@ -1,88 +0,0 @@ -use std::collections::HashMap; -use std::io::Write; -use std::path::PathBuf; - -use bit_set::BitSet; - -use crate::{util, Error}; - -pub fn run(matches: &clap::ArgMatches) -> Result<(), Error> { - Args::new(matches).and_then(|args| args.run()) -} - -#[derive(Clone, Debug, Hash, Eq, PartialEq)] -struct FullNode { - is_final: bool, - final_output: fst::raw::Output, - trans: Vec, -} - -impl FullNode { - fn from_node(node: &fst::raw::Node) -> FullNode { - FullNode { - is_final: node.is_final(), - 
final_output: node.final_output(), - trans: node.transitions().collect(), - } - } -} - -#[derive(Debug)] -struct Args { - input: PathBuf, - output: Option, - limit: usize, - min: i32, -} - -impl Args { - fn new(m: &clap::ArgMatches) -> Result { - Ok(Args { - input: m.value_of_os("input").map(PathBuf::from).unwrap(), - output: m.value_of_os("output").map(PathBuf::from), - limit: m.value_of_lossy("limit").unwrap().parse()?, - min: m.value_of_lossy("min").unwrap().parse()?, - }) - } - - fn run(&self) -> Result<(), Error> { - let mut wtr = util::get_buf_writer(self.output.as_ref())?; - let fst = unsafe { util::mmap_fst(&self.input)? }; - let mut set = BitSet::with_capacity(fst.len()); - let mut node_counts = HashMap::with_capacity(10_000); - - let mut stack = vec![fst.root().addr()]; - while let Some(addr) = stack.pop() { - if set.contains(addr) { - continue; - } - set.insert(addr); - - let full_node = FullNode::from_node(&fst.node(addr)); - for t in &full_node.trans { - stack.push(t.addr); - } - *node_counts.entry(full_node).or_insert(0) += 1; - } - - let total = node_counts.values().fold(0, |n, c| n + c); - let unique = node_counts.len(); - let mut counts: Vec<(FullNode, i32)> = - node_counts.into_iter().filter(|&(_, c)| c > self.min).collect(); - counts.sort_by(|&(_, ref c1), &(_, ref c2)| c1.cmp(c2).reverse()); - - writeln!(wtr, "Total nodes: {}", total)?; - writeln!(wtr, "Unique nodes: {}", unique)?; - writeln!(wtr, "Nodes with duplicates: {}", counts.len())?; - writeln!(wtr, "----------------------------------")?; - - for &(ref fnode, count) in counts.iter().take(self.limit) { - writeln!(wtr, "Duplicated {} times", count)?; - writeln!(wtr, "{:#?}", fnode)?; - writeln!(wtr, "----------------------------------")?; - } - - wtr.flush()?; - Ok(()) - } -} diff --git a/fst-bin/src/cmd/fuzzy.rs b/fst-bin/src/cmd/fuzzy.rs deleted file mode 100644 index 0c407f27..00000000 --- a/fst-bin/src/cmd/fuzzy.rs +++ /dev/null @@ -1,69 +0,0 @@ -use std::io; -use std::path::PathBuf; - 
-use bstr::{BString, ByteVec}; -use fst::automaton::{Automaton, Levenshtein}; - -use crate::{util, Error}; - -pub fn run(matches: &clap::ArgMatches) -> Result<(), Error> { - Args::new(matches).and_then(|args| args.run()) -} - -#[derive(Debug)] -struct Args { - input: PathBuf, - query: String, - distance: u32, - prefix: bool, - outputs: bool, - start: Option, - end: Option, -} - -impl Args { - fn new(m: &clap::ArgMatches) -> Result { - Ok(Args { - input: m.value_of_os("input").map(PathBuf::from).unwrap(), - query: m - .value_of_os("query") - .map(|v| v.to_string_lossy().into_owned()) - .unwrap(), - distance: m.value_of_lossy("distance").unwrap().parse()?, - prefix: m.is_present("prefix"), - outputs: m.is_present("outputs"), - start: m - .value_of_os("start") - .map(|v| Vec::from_os_str_lossy(v).into_owned().into()), - end: m - .value_of_os("end") - .map(|v| Vec::from_os_str_lossy(v).into_owned().into()), - }) - } - - fn run(&self) -> Result<(), Error> { - let fst = unsafe { util::mmap_fst(&self.input)? 
}; - let lev = Levenshtein::new(&self.query, self.distance)?; - let stdout = io::BufWriter::new(io::stdout()); - - if self.prefix { - let mut q = fst.search(lev.starts_with()); - if let Some(ref start) = self.start { - q = q.ge(start); - } - if let Some(ref end) = self.end { - q = q.le(end); - } - util::print_stream(stdout, self.outputs, q) - } else { - let mut q = fst.search(lev); - if let Some(ref start) = self.start { - q = q.ge(start); - } - if let Some(ref end) = self.end { - q = q.le(end); - } - util::print_stream(stdout, self.outputs, q) - } - } -} diff --git a/fst-bin/src/cmd/grep.rs b/fst-bin/src/cmd/grep.rs deleted file mode 100644 index 17b57338..00000000 --- a/fst-bin/src/cmd/grep.rs +++ /dev/null @@ -1,68 +0,0 @@ -use std::io; -use std::path::PathBuf; - -use bstr::{BString, ByteVec}; -use regex_automata::dense; - -use crate::{util, Error}; - -pub fn run(matches: &clap::ArgMatches) -> Result<(), Error> { - Args::new(matches).and_then(|args| args.run()) -} - -#[derive(Debug)] -struct Args { - input: PathBuf, - regex: String, - outputs: bool, - start: Option, - end: Option, -} - -impl Args { - fn new(m: &clap::ArgMatches) -> Result { - Ok(Args { - input: m.value_of_os("input").map(PathBuf::from).unwrap(), - regex: m - .value_of_os("regex") - .map(|v| v.to_string_lossy().into_owned()) - .unwrap(), - outputs: m.is_present("outputs"), - start: m - .value_of_os("start") - .map(|v| Vec::from_os_str_lossy(v).into_owned().into()), - end: m - .value_of_os("end") - .map(|v| Vec::from_os_str_lossy(v).into_owned().into()), - }) - } - - fn run(&self) -> Result<(), Error> { - let reverse = std::env::var("FST_BIN_DFA_REVERSE") - .ok() - .map_or(false, |v| v == "1"); - let minimize = std::env::var("FST_BIN_DFA_MINIMIZE") - .ok() - .map_or(false, |v| v == "1"); - let fst = unsafe { util::mmap_fst(&self.input)? 
}; - let dense_dfa = dense::Builder::new() - .anchored(true) - .minimize(minimize) - .byte_classes(true) - .premultiply(true) - .reverse(reverse) - .build(&self.regex)?; - let dfa = match dense_dfa { - dense::DenseDFA::PremultipliedByteClass(dfa) => dfa, - _ => unreachable!(), - }; - let mut q = fst.search(&dfa); - if let Some(ref start) = self.start { - q = q.ge(start); - } - if let Some(ref end) = self.end { - q = q.le(end); - } - util::print_stream(io::BufWriter::new(io::stdout()), self.outputs, q) - } -} diff --git a/fst-bin/src/cmd/map.rs b/fst-bin/src/cmd/map.rs deleted file mode 100644 index d6e356e2..00000000 --- a/fst-bin/src/cmd/map.rs +++ /dev/null @@ -1,106 +0,0 @@ -use std::cmp; -use std::fs; -use std::path::{Path, PathBuf}; - -use fst::MapBuilder; -use serde::Deserialize; - -use crate::merge::Merger; -use crate::util; -use crate::Error; - -pub fn run(matches: &clap::ArgMatches) -> Result<(), Error> { - Args::new(matches).and_then(|args| args.run()) -} - -#[derive(Debug, Deserialize)] -struct Args { - input: Vec, - output: PathBuf, - force: bool, - sorted: bool, - fd_limit: u32, - batch_size: u32, - threads: Option, - tmp_dir: Option, - keep_tmp_dir: bool, - max: bool, - min: bool, -} - -impl Args { - fn new(m: &clap::ArgMatches) -> Result { - let threads = m.value_of_lossy("threads").unwrap().parse()?; - Ok(Args { - input: m - .values_of_os("input") - .unwrap() - .map(PathBuf::from) - .collect(), - output: m.value_of_os("output").map(PathBuf::from).unwrap(), - force: m.is_present("force"), - sorted: m.is_present("sorted"), - fd_limit: m.value_of_lossy("fd-limit").unwrap().parse()?, - batch_size: m.value_of_lossy("batch-size").unwrap().parse()?, - threads: if threads == 0 { None } else { Some(threads) }, - tmp_dir: m.value_of_os("tmp-dir").map(PathBuf::from), - keep_tmp_dir: m.is_present("keep-tmp-dir"), - max: m.is_present("max"), - min: m.is_present("min"), - }) - } - - fn run(&self) -> Result<(), Error> { - if !self.force && 
fs::metadata(&self.output).is_ok() { - anyhow::bail!("Output file already exists: {:?}", self.output); - } - if self.sorted { - self.run_sorted() - } else { - self.run_unsorted() - } - } - - fn run_sorted(&self) -> Result<(), Error> { - let wtr = util::get_buf_writer(Some(&self.output))?; - let mut map = MapBuilder::new(wtr)?; - for input in &self.input { - let mut rdr = csv::ReaderBuilder::new() - .has_headers(false) - .from_reader(util::get_reader(Some(input))?); - for row in rdr.deserialize() { - let (key, val): (String, u64) = row?; - map.insert(key, val)?; - } - } - map.finish().map_err(From::from) - } - - fn run_unsorted(&self) -> Result<(), Error> { - let inputs = self - .input - .iter() - .map(|inp| Path::new(inp).to_path_buf()) - .collect(); - let keys = util::ConcatCsv::new(inputs); - - let mut merger = Merger::new(keys, &self.output); - merger = merger.fd_limit(self.fd_limit); - merger = merger.batch_size(self.batch_size); - merger = merger.keep_tmp_dir(self.keep_tmp_dir); - if let Some(threads) = self.threads { - merger = merger.threads(threads); - } - if let Some(ref tmp_dir) = self.tmp_dir { - merger = merger.tmp_dir(tmp_dir); - } - if self.max { - merger = merger.value_merger(|x, y| cmp::max(x, y)); - } else if self.min { - merger = merger.value_merger(|x, y| cmp::min(x, y)); - } else { - merger = merger.value_merger(|x, y| x + y); - } - merger.merge() - } -} diff --git a/fst-bin/src/cmd/mod.rs b/fst-bin/src/cmd/mod.rs deleted file mode 100644 index 5085baa7..00000000 --- a/fst-bin/src/cmd/mod.rs +++ /dev/null @@ -1,12 +0,0 @@ -pub mod csv; -pub mod dot; -pub mod dupes; -pub mod fuzzy; -pub mod grep; -pub mod map; -pub mod node; -pub mod range; -pub mod rust; -pub mod set; -pub mod union; -pub mod verify; diff --git a/fst-bin/src/cmd/node.rs b/fst-bin/src/cmd/node.rs deleted file mode 100644 index 84a320ba..00000000 --- a/fst-bin/src/cmd/node.rs +++ /dev/null @@ -1,31 +0,0 @@ -use std::io::Write; -use std::path::PathBuf; - -use crate::{util, Error}; 
- -pub fn run(matches: &clap::ArgMatches) -> Result<(), Error> { - Args::new(matches).and_then(|args| args.run()) -} - -#[derive(Debug)] -struct Args { - input: PathBuf, - node_address: usize, -} - -impl Args { - fn new(m: &clap::ArgMatches) -> Result { - Ok(Args { - input: m.value_of_os("input").map(PathBuf::from).unwrap(), - node_address: m.value_of_lossy("node-address").unwrap().parse()?, - }) - } - - fn run(&self) -> Result<(), Error> { - let mut wtr = util::get_buf_writer::<&str>(None)?; - let fst = unsafe { util::mmap_fst(&self.input)? }; - let node = fst.node(self.node_address); - writeln!(wtr, "{:?}", node)?; - Ok(()) - } -} diff --git a/fst-bin/src/cmd/range.rs b/fst-bin/src/cmd/range.rs deleted file mode 100644 index d104ab2e..00000000 --- a/fst-bin/src/cmd/range.rs +++ /dev/null @@ -1,43 +0,0 @@ -use std::ffi::OsString; -use std::io; -use std::path::PathBuf; - -use bstr::ByteVec; - -use crate::util; -use crate::Error; - -pub fn run(matches: &clap::ArgMatches) -> Result<(), Error> { - Args::new(matches).and_then(|args| args.run()) -} - -#[derive(Debug)] -struct Args { - input: PathBuf, - outputs: bool, - start: Option, - end: Option, -} - -impl Args { - fn new(m: &clap::ArgMatches) -> Result { - Ok(Args { - input: m.value_of_os("input").map(PathBuf::from).unwrap(), - outputs: m.is_present("outputs"), - start: m.value_of_os("start").map(|v| v.to_os_string()), - end: m.value_of_os("end").map(|v| v.to_os_string()), - }) - } - - fn run(&self) -> Result<(), Error> { - let fst = unsafe { util::mmap_fst(&self.input)? 
}; - let mut q = fst.range(); - if let Some(ref start) = self.start { - q = q.ge(Vec::from_os_str_lossy(start)); - } - if let Some(ref end) = self.end { - q = q.le(Vec::from_os_str_lossy(end)); - } - util::print_stream(io::BufWriter::new(io::stdout()), self.outputs, q) - } -} diff --git a/fst-bin/src/cmd/rust.rs b/fst-bin/src/cmd/rust.rs deleted file mode 100644 index f6642d94..00000000 --- a/fst-bin/src/cmd/rust.rs +++ /dev/null @@ -1,62 +0,0 @@ -use std::io::{Read, Write}; -use std::path::PathBuf; - -use crate::{util, Error}; - -pub fn run(matches: &clap::ArgMatches) -> Result<(), Error> { - Args::new(matches).and_then(|args| args.run()) -} - -#[derive(Debug)] -struct Args { - input: PathBuf, - name: String, -} - -impl Args { - fn new(m: &clap::ArgMatches) -> Result { - Ok(Args { - input: m.value_of_os("input").map(PathBuf::from).unwrap(), - name: m - .value_of_os("name") - .map(|v| v.to_string_lossy().into_owned()) - .unwrap(), - }) - } - - fn run(&self) -> Result<(), Error> { - let mut wtr = util::get_buf_writer::<&str>(None)?; - let mut rdr = util::get_buf_reader(Some(&self.input))?; - - let mut bytes = vec![]; - rdr.read_to_end(&mut bytes)?; - - writeln!(wtr, "lazy_static! 
{{")?; - writeln!(wtr, " pub static ref {}: ::fst::raw::Fst = ", self.name)?; - writeln!( - wtr, - " ::fst::raw::Fst::new({}_BYTES).unwrap();", - self.name - )?; - writeln!(wtr, "}}\n")?; - - writeln!(wtr, "const {}_BYTES: &'static [u8] = b\"\\", self.name)?; - let mut column = 0; - for b in bytes { - let escaped = if (b as char).is_whitespace() { - format!("\\x{:02x}", b) - } else { - util::escape_input(b) - }; - if column + escaped.len() >= 79 { - column = 0; - write!(wtr, "\\\n")?; - } - column += escaped.len(); - write!(wtr, "{}", escaped)?; - } - writeln!(wtr, "\\\n\";")?; - - Ok(()) - } -} diff --git a/fst-bin/src/cmd/set.rs b/fst-bin/src/cmd/set.rs deleted file mode 100644 index b61cfe3f..00000000 --- a/fst-bin/src/cmd/set.rs +++ /dev/null @@ -1,98 +0,0 @@ -use std::fs; -use std::io; -use std::path::{Path, PathBuf}; - -use bstr::io::BufReadExt; -use fst::SetBuilder; - -use crate::merge::Merger; -use crate::util; -use crate::Error; - -pub fn run(matches: &clap::ArgMatches) -> Result<(), Error> { - Args::new(matches).and_then(|args| args.run()) -} - -#[derive(Debug)] -struct Args { - input: Vec, - output: PathBuf, - force: bool, - sorted: bool, - fd_limit: u32, - batch_size: u32, - threads: Option, - tmp_dir: Option, - keep_tmp_dir: bool, -} - -impl Args { - fn new(m: &clap::ArgMatches) -> Result { - let threads = m.value_of_lossy("threads").unwrap().parse()?; - Ok(Args { - input: m - .values_of_os("input") - .unwrap() - .map(PathBuf::from) - .collect(), - output: m.value_of_os("output").map(PathBuf::from).unwrap(), - force: m.is_present("force"), - sorted: m.is_present("sorted"), - fd_limit: m.value_of_lossy("fd-limit").unwrap().parse()?, - batch_size: m.value_of_lossy("batch-size").unwrap().parse()?, - threads: if threads == 0 { None } else { Some(threads) }, - tmp_dir: m.value_of_os("tmp-dir").map(PathBuf::from), - keep_tmp_dir: m.is_present("keep-tmp-dir"), - }) - } - - fn run(&self) -> Result<(), Error> { - if !self.force && 
fs::metadata(&self.output).is_ok() { - anyhow::bail!("Output file already exists: {:?}", self.output); - } - if self.sorted { - self.run_sorted() - } else { - self.run_unsorted() - } - } - - fn run_sorted(&self) -> Result<(), Error> { - let wtr = util::get_buf_writer(Some(&self.output))?; - let mut set = SetBuilder::new(wtr)?; - for input in &self.input { - let mut rdr = util::get_buf_reader(Some(input))?; - rdr.for_byte_line(|line| { - if line.is_empty() { - return Ok(false); - } - set.insert(line) - .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?; - Ok(true) - })?; - } - set.finish().map_err(From::from) - } - - fn run_unsorted(&self) -> Result<(), Error> { - let inputs = self - .input - .iter() - .map(|inp| Path::new(inp).to_path_buf()) - .collect(); - let keys = util::ConcatLines::new(inputs) - .map(|result| result.map(|line| (line, 0)).map_err(From::from)); - - let mut merger = Merger::new(keys, &self.output); - merger = merger.fd_limit(self.fd_limit); - merger = merger.batch_size(self.batch_size); - merger = merger.keep_tmp_dir(self.keep_tmp_dir); - if let Some(threads) = self.threads { - merger = merger.threads(threads); - } - if let Some(ref tmp_dir) = self.tmp_dir { - merger = merger.tmp_dir(tmp_dir); - } - merger.merge() - } -} diff --git a/fst-bin/src/cmd/union.rs b/fst-bin/src/cmd/union.rs deleted file mode 100644 index 6c1fffd8..00000000 --- a/fst-bin/src/cmd/union.rs +++ /dev/null @@ -1,51 +0,0 @@ -use std::fs; -use std::path::PathBuf; - -use crate::{util, Error}; - -pub fn run(matches: &clap::ArgMatches) -> Result<(), Error> { - Args::new(matches).and_then(|args| args.run()) -} - -#[derive(Debug)] -struct Args { - input: Vec, - output: PathBuf, - force: bool, -} - -impl Args { - fn new(m: &clap::ArgMatches) -> Result { - Ok(Args { - input: m - .values_of_os("input") - .unwrap() - .map(PathBuf::from) - .collect(), - output: m.value_of_os("output").map(PathBuf::from).unwrap(), - force: m.is_present("force"), - }) - } - - fn run(&self) -> 
Result<(), Error> { - if !self.force && fs::metadata(&self.output).is_ok() { - anyhow::bail!( - "Output file already exists: {}", - self.output.display() - ); - } - - let wtr = util::get_buf_writer(Some(&self.output))?; - let mut merged = fst::SetBuilder::new(wtr)?; - - let mut sets = vec![]; - for set_path in &self.input { - let fst = unsafe { util::mmap_fst(set_path)? }; - sets.push(fst::Set::from(fst)); - } - let union = sets.iter().collect::().union(); - merged.extend_stream(union)?; - merged.finish()?; - Ok(()) - } -} diff --git a/fst-bin/src/cmd/verify.rs b/fst-bin/src/cmd/verify.rs deleted file mode 100644 index 3d266d5d..00000000 --- a/fst-bin/src/cmd/verify.rs +++ /dev/null @@ -1,24 +0,0 @@ -use std::path::PathBuf; - -use crate::{util, Error}; - -pub fn run(matches: &clap::ArgMatches) -> Result<(), Error> { - Args::new(matches).and_then(|args| args.run()) -} - -#[derive(Debug)] -struct Args { - input: PathBuf, -} - -impl Args { - fn new(m: &clap::ArgMatches) -> Result { - Ok(Args { input: m.value_of_os("input").map(PathBuf::from).unwrap() }) - } - - fn run(&self) -> Result<(), Error> { - let fst = unsafe { util::mmap_fst(&self.input)? 
}; - fst.verify()?; - Ok(()) - } -} diff --git a/fst-bin/src/main.rs b/fst-bin/src/main.rs deleted file mode 100644 index 8d91cb4b..00000000 --- a/fst-bin/src/main.rs +++ /dev/null @@ -1,31 +0,0 @@ -use anyhow::Error; - -mod app; -mod cmd; -mod merge; -mod util; - -fn main() -> Result<(), Error> { - match crate::app::app().get_matches().subcommand() { - ("csv", Some(m)) => cmd::csv::run(m), - ("dot", Some(m)) => cmd::dot::run(m), - ("dupes", Some(m)) => cmd::dupes::run(m), - ("fuzzy", Some(m)) => cmd::fuzzy::run(m), - ("grep", Some(m)) => cmd::grep::run(m), - ("map", Some(m)) => cmd::map::run(m), - ("node", Some(m)) => cmd::node::run(m), - ("range", Some(m)) => cmd::range::run(m), - ("rust", Some(m)) => cmd::rust::run(m), - ("set", Some(m)) => cmd::set::run(m), - ("union", Some(m)) => cmd::union::run(m), - ("verify", Some(m)) => cmd::verify::run(m), - ("", _) => { - app::app().print_help()?; - println!(""); - Ok(()) - } - (unknown, _) => { - Err(anyhow::anyhow!("unrecognized command: {}", unknown)) - } - } -} diff --git a/fst-bin/src/merge.rs b/fst-bin/src/merge.rs deleted file mode 100644 index d464f51d..00000000 --- a/fst-bin/src/merge.rs +++ /dev/null @@ -1,276 +0,0 @@ -use std::cmp; -use std::env; -use std::fs::{self, File}; -use std::io; -use std::path::{Path, PathBuf}; -use std::sync::Arc; -use std::thread; - -use bstr::BString; -use crossbeam_channel as chan; -use fst::{self, raw, Streamer}; -use tempfile; - -use crate::util; -use crate::Error; - -pub struct Merger { - it: I, - output: PathBuf, - value_merger: Option u64 + Send + Sync + 'static>>, - fd_limit: u32, - batch_size: u32, - threads: u32, - tmp_dir: PathBuf, - keep_tmp_dir: bool, -} - -type KV = (BString, u64); - -impl Merger -where - I: Iterator> + Send + 'static, -{ - pub fn new(it: T, output: P) -> Merger - where - P: AsRef, - T: IntoIterator, - { - Merger { - it: it.into_iter(), - output: output.as_ref().to_path_buf(), - value_merger: None, - fd_limit: 5, - batch_size: 100_000, - threads: 
std::thread::available_parallelism() - .map_or(1, |x| x.get() as u32), - tmp_dir: env::temp_dir(), - keep_tmp_dir: false, - } - } - - pub fn value_merger(mut self, f: F) -> Merger - where - F: Fn(u64, u64) -> u64 + Send + Sync + 'static, - { - self.value_merger = Some(Arc::new(f)); - self - } - - pub fn fd_limit(mut self, fd_limit: u32) -> Merger { - self.fd_limit = fd_limit; - self - } - - pub fn batch_size(mut self, batch_size: u32) -> Merger { - self.batch_size = batch_size; - self - } - - pub fn threads(mut self, threads: u32) -> Merger { - self.threads = threads; - self - } - - pub fn tmp_dir>(mut self, path: P) -> Merger { - self.tmp_dir = path.as_ref().to_path_buf(); - self - } - - pub fn keep_tmp_dir(mut self, yes: bool) -> Merger { - self.keep_tmp_dir = yes; - self - } - - pub fn merge(self) -> Result<(), Error> { - let tmp_dir = tempfile::Builder::new() - .prefix("rust-fst") - .tempdir_in(&self.tmp_dir)?; - let tmp_dir_path = Arc::new(tmp_dir.path().to_path_buf()); - - // Do the initial round of creating FSTs for every batch in the input. - let batches = batcher(self.it, self.batch_size, self.threads); - let sorters = Sorters::new(self.threads); - for (i, kv_batch) in batches.into_iter().enumerate() { - sorters.create_fst(KvBatch { - tmp_dir: tmp_dir_path.clone(), - index: i, - kvs: kv_batch?, - }); - } - let mut results = sorters.results(); - - // Nothing? Create an empty FST and be done with it. - if results.is_empty() { - let wtr = io::BufWriter::new(File::create(&self.output)?); - let builder = raw::Builder::new(wtr)?; - builder.finish()?; - return Ok(()); - } - - // Now union batches of FSTs until only one remains. - // That one is our final FST with all key/values. 
- let mut gen = 0; - while results.len() > 1 { - let batches = batcher(results, self.fd_limit, self.threads); - let sorters = Sorters::new(self.threads); - for (i, union_batch) in batches.into_iter().enumerate() { - sorters.create_fst(UnionBatch { - tmp_dir: tmp_dir_path.clone(), - gen, - index: i, - fsts: union_batch?, - value_merger: self.value_merger.clone(), - }); - } - results = sorters.results(); - gen += 1; - } - assert_eq!(results.len(), 1); - fs::copy(results.pop().unwrap()?, &self.output)?; - if self.keep_tmp_dir { - drop(tmp_dir.into_path()); - } - Ok(()) - } -} - -fn batcher( - it: I, - batch_size: u32, - threads: u32, -) -> chan::Receiver, Error>> -where - T: Send + 'static, - IT: Iterator> + Send + 'static, - I: IntoIterator + Send + 'static, -{ - let batch_size = batch_size as usize; - let (send, recv) = chan::bounded(cmp::min(1, threads as usize / 3)); - let it = it.into_iter(); - thread::spawn(move || { - let mut batch = Vec::with_capacity(batch_size); - for item in it { - match item { - Err(err) => { - send.send(Err(From::from(err))).unwrap(); - return; - } - Ok(item) => { - batch.push(item); - if batch.len() >= batch_size { - send.send(Ok(batch)).unwrap(); - batch = Vec::with_capacity(batch_size); - } - } - } - } - if !batch.is_empty() { - send.send(Ok(batch)).unwrap(); - } - }); - recv -} - -struct Sorters { - send: chan::Sender, - results: chan::Receiver>>, -} - -impl Sorters { - fn new(threads: u32) -> Sorters { - let (bsend, brecv) = chan::bounded::(0); - let (rsend, rrecv) = chan::bounded(0); - for _ in 0..threads { - let brecv = brecv.clone(); - let rsend = rsend.clone(); - thread::spawn(move || { - let mut results = vec![]; - for mut batch in brecv { - results.push(batch.create_fst()); - } - rsend.send(results).unwrap(); - }); - } - Sorters { send: bsend, results: rrecv } - } - - fn create_fst(&self, batch: B) { - self.send.send(batch).unwrap(); - } - - fn results(self) -> Vec> { - drop(self.send); - let mut results = vec![]; - for rs in 
self.results { - results.extend(rs); - } - results - } -} - -trait Batchable { - fn create_fst(&mut self) -> Result; -} - -struct KvBatch { - tmp_dir: Arc, - index: usize, - kvs: Vec, -} - -impl Batchable for KvBatch { - fn create_fst(&mut self) -> Result { - self.kvs.sort(); - self.kvs.dedup(); - let file_name = format!("batch{}", self.index); - let path = self.tmp_dir.join(file_name).to_path_buf(); - let wtr = io::BufWriter::new(File::create(&path)?); - let mut builder = raw::Builder::new(wtr)?; - for &(ref k, v) in &self.kvs { - match builder.insert(k, v) { - Ok(_) => {} - Err(fst::Error::Fst(fst::raw::Error::DuplicateKey { - .. - })) => {} - Err(err) => return Err(From::from(err)), - } - } - builder.finish()?; - Ok(path) - } -} - -struct UnionBatch { - tmp_dir: Arc, - gen: usize, - index: usize, - fsts: Vec, - value_merger: Option u64 + Send + Sync + 'static>>, -} - -impl Batchable for UnionBatch { - fn create_fst(&mut self) -> Result { - let file_name = format!("union-gen{}-batch{}", self.gen, self.index); - let path = self.tmp_dir.join(file_name).to_path_buf(); - let wtr = io::BufWriter::new(File::create(&path)?); - let mut builder = raw::Builder::new(wtr)?; - - let mut fsts = vec![]; - for path in &self.fsts { - fsts.push(unsafe { util::mmap_fst(path)? 
}); - } - let mut union = fsts.iter().collect::().union(); - while let Some((key, outputs)) = union.next() { - let mut merged = 0; - if let Some(ref value_merger) = self.value_merger { - merged = outputs - .iter() - .fold(0, |merged, iv| value_merger(merged, iv.value)); - } - builder.insert(key, merged)?; - } - builder.finish()?; - Ok(path) - } -} diff --git a/fst-bin/src/util.rs b/fst-bin/src/util.rs deleted file mode 100644 index 2cc43153..00000000 --- a/fst-bin/src/util.rs +++ /dev/null @@ -1,188 +0,0 @@ -use std::ascii; -use std::fs::File; -use std::io; -use std::path::{Path, PathBuf}; - -use bstr::{io::BufReadExt, BString}; -use csv; -use fst::raw::{Fst, Output}; -use fst::{IntoStreamer, Streamer}; -use memmap2::Mmap; - -use crate::Error; - -pub unsafe fn mmap_fst>(path: P) -> Result, Error> { - let mmap = Mmap::map(&File::open(path)?)?; - let fst = Fst::new(mmap)?; - Ok(fst) -} - -pub fn escape_input(b: u8) -> String { - String::from_utf8(ascii::escape_default(b).collect::>()).unwrap() -} - -pub fn get_buf_reader>( - path: Option, -) -> io::Result>> { - Ok(io::BufReader::new(get_reader(path)?)) -} - -pub fn get_buf_writer>( - path: Option, -) -> io::Result>> { - Ok(io::BufWriter::new(get_writer(path)?)) -} - -pub fn get_reader>( - path: Option, -) -> io::Result> { - Ok(match to_stdio(path) { - None => Box::new(io::stdin()), - Some(path) => Box::new(File::open(path)?), - }) -} - -pub fn get_writer>( - path: Option, -) -> io::Result> { - Ok(match to_stdio(path) { - None => Box::new(io::stdout()), - Some(path) => Box::new(File::create(path)?), - }) -} - -fn to_stdio>(path: Option) -> Option { - match path { - None => None, - Some(s) => { - if s.as_ref().to_string_lossy() == "-" { - None - } else { - Some(s.as_ref().to_path_buf()) - } - } - } -} - -pub fn print_stream<'f, W, I, S>( - mut wtr: W, - outputs: bool, - stream: I, -) -> Result<(), Error> -where - W: io::Write, - I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>, - S: 'f + for<'a> 
Streamer<'a, Item = (&'a [u8], Output)>, -{ - let mut stream = stream.into_stream(); - if outputs { - let mut wtr = csv::Writer::from_writer(wtr); - while let Some((k, v)) = stream.next() { - let v = v.value().to_string(); - wtr.write_record((&[k, v.as_bytes()]).iter())?; - } - wtr.flush().map_err(From::from) - } else { - while let Some((k, _)) = stream.next() { - wtr.write_all(k)?; - wtr.write_all(b"\n")?; - } - wtr.flush().map_err(From::from) - } -} - -pub struct ConcatLines { - inputs: Vec, - cur: Option, -} - -type Lines = bstr::io::ByteLines< - io::BufReader>, ->; - -impl ConcatLines { - pub fn new(mut inputs: Vec) -> ConcatLines { - inputs.reverse(); // treat it as a stack - ConcatLines { inputs, cur: None } - } -} - -impl Iterator for ConcatLines { - type Item = io::Result; - - fn next(&mut self) -> Option> { - loop { - if self.cur.is_none() { - match self.inputs.pop() { - None => return None, - Some(path) => { - let rdr = match get_buf_reader(Some(path)) { - Err(err) => return Some(Err(err)), - Ok(rdr) => rdr, - }; - self.cur = Some(rdr.byte_lines()); - } - } - } - match self.cur.as_mut().and_then(|lines| lines.next()) { - None => self.cur = None, - Some(r) => return Some(r.map(BString::from)), - } - } - } -} - -pub struct ConcatCsv { - inputs: Vec, - cur: Option, -} - -type Reader = Box; -type Rows = csv::DeserializeRecordsIntoIter; - -impl ConcatCsv { - pub fn new(mut inputs: Vec) -> ConcatCsv { - inputs.reverse(); // treat it as a stack - ConcatCsv { inputs, cur: None } - } - - fn read_row(&mut self) -> Option> { - let rdr = match self.cur { - None => return None, - Some(ref mut rdr) => rdr, - }; - match rdr.next() { - Some(Ok((k, v))) => Some(Ok((k, v))), - Some(Err(err)) => Some(Err(From::from(err))), - None => None, - } - } -} - -impl Iterator for ConcatCsv { - type Item = Result<(BString, u64), Error>; - - fn next(&mut self) -> Option> { - loop { - if self.cur.is_none() { - match self.inputs.pop() { - None => return None, - Some(path) => { - let rdr 
= match get_reader(Some(path)) { - Err(err) => return Some(Err(From::from(err))), - Ok(rdr) => rdr, - }; - let csvrdr = csv::ReaderBuilder::new() - .has_headers(false) - .from_reader(rdr); - self.cur = Some(csvrdr.into_deserialize()); - } - } - } - match self.read_row() { - None => self.cur = None, - Some(r) => return Some(r), - } - } - } -} From 7ef49f53833117fadec311c35faf3ce17abe7bee Mon Sep 17 00:00:00 2001 From: Jonas Kruckenberg <118265418+CrabNejonas@users.noreply.github.com> Date: Wed, 17 Jan 2024 18:17:24 +0100 Subject: [PATCH 08/28] Delete fst-levenshtein directory --- fst-levenshtein/Cargo.toml | 17 -- fst-levenshtein/src/error.rs | 39 ----- fst-levenshtein/src/lib.rs | 315 ----------------------------------- 3 files changed, 371 deletions(-) delete mode 100644 fst-levenshtein/Cargo.toml delete mode 100644 fst-levenshtein/src/error.rs delete mode 100644 fst-levenshtein/src/lib.rs diff --git a/fst-levenshtein/Cargo.toml b/fst-levenshtein/Cargo.toml deleted file mode 100644 index 868e0dec..00000000 --- a/fst-levenshtein/Cargo.toml +++ /dev/null @@ -1,17 +0,0 @@ -[package] -name = "fst-levenshtein" -version = "0.3.0" #:version -authors = ["Andrew Gallant "] -description = """ -DEPRECATED. Use 'fst' crate with 'levenshtein' feature instead. -""" -documentation = "https://docs.rs/fst-levenshtein" -homepage = "https://github.com/BurntSushi/fst" -repository = "https://github.com/BurntSushi/fst" -keywords = ["search", "information", "retrieval", "dictionary", "map"] -license = "Unlicense/MIT" -edition = "2018" - -[dependencies] -fst = "0.3.1" -utf8-ranges = "1" diff --git a/fst-levenshtein/src/error.rs b/fst-levenshtein/src/error.rs deleted file mode 100644 index 24322bd3..00000000 --- a/fst-levenshtein/src/error.rs +++ /dev/null @@ -1,39 +0,0 @@ -use std::error; -use std::fmt; - -/// An error that occurred while building a Levenshtein automaton. 
-#[derive(Debug)] -pub enum Error { - /// If construction of the automaton reaches some hard-coded limit - /// on the number of states, then this error is returned. - /// - /// The number given is the limit that was exceeded. - TooManyStates(usize), -} - -impl fmt::Display for Error { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - use self::Error::*; - match *self { - TooManyStates(size_limit) => write!( - f, - "Levenshtein automaton exceeds size limit of \ - {} states", - size_limit - ), - } - } -} - -impl error::Error for Error { - fn description(&self) -> &str { - use self::Error::*; - match *self { - TooManyStates(_) => "levenshtein automaton has too many states", - } - } - - fn cause(&self) -> Option<&dyn error::Error> { - None - } -} diff --git a/fst-levenshtein/src/lib.rs b/fst-levenshtein/src/lib.rs deleted file mode 100644 index e5c65f97..00000000 --- a/fst-levenshtein/src/lib.rs +++ /dev/null @@ -1,315 +0,0 @@ -use std::cmp; -use std::collections::hash_map::Entry; -use std::collections::{HashMap, HashSet}; -use std::fmt; - -use utf8_ranges::{Utf8Range, Utf8Sequences}; - -use fst::automaton::Automaton; - -pub use self::error::Error; - -mod error; - -const STATE_LIMIT: usize = 10_000; // currently at least 20MB >_< - -/// A Unicode aware Levenshtein automaton for running efficient fuzzy queries. -/// -/// A Levenshtein automata is one way to search any finite state transducer -/// for keys that *approximately* match a given query. A Levenshtein automaton -/// approximates this by returning all keys within a certain edit distance of -/// the query. The edit distance is defined by the number of insertions, -/// deletions and substitutions required to turn the query into the key. -/// Insertions, deletions and substitutions are based on -/// **Unicode characters** (where each character is a single Unicode scalar -/// value). -/// -/// # Example -/// -/// This example shows how to find all keys within an edit distance of `1` -/// from `foo`. 
-/// -/// ```rust -/// extern crate fst; -/// extern crate fst_levenshtein; -/// -/// use fst::{IntoStreamer, Streamer, Set}; -/// use fst_levenshtein::Levenshtein; -/// -/// fn main() { -/// let keys = vec!["fa", "fo", "fob", "focus", "foo", "food", "foul"]; -/// let set = Set::from_iter(keys).unwrap(); -/// -/// let lev = Levenshtein::new("foo", 1).unwrap(); -/// let mut stream = set.search(&lev).into_stream(); -/// -/// let mut keys = vec![]; -/// while let Some(key) = stream.next() { -/// keys.push(key.to_vec()); -/// } -/// assert_eq!(keys, vec![ -/// "fo".as_bytes(), // 1 deletion -/// "fob".as_bytes(), // 1 substitution -/// "foo".as_bytes(), // 0 insertions/deletions/substitutions -/// "food".as_bytes(), // 1 insertion -/// ]); -/// } -/// ``` -/// -/// This example only uses ASCII characters, but it will work equally well -/// on Unicode characters. -/// -/// # Warning: experimental -/// -/// While executing this Levenshtein automaton against a finite state -/// transducer will be very fast, *constructing* an automaton may not be. -/// Namely, this implementation is a proof of concept. While I believe the -/// algorithmic complexity is not exponential, the implementation is not speedy -/// and it can use enormous amounts of memory (tens of MB before a hard-coded -/// limit will cause an error to be returned). -/// -/// This is important functionality, so one should count on this implementation -/// being vastly improved in the future. -pub struct Levenshtein { - prog: DynamicLevenshtein, - dfa: Dfa, -} - -impl Levenshtein { - /// Create a new Levenshtein query. - /// - /// The query finds all matching terms that are at most `distance` - /// edit operations from `query`. (An edit operation may be an insertion, - /// a deletion or a substitution.) - /// - /// If the underlying automaton becomes too big, then an error is returned. 
- /// - /// A `Levenshtein` value satisfies the `Automaton` trait, which means it - /// can be used with the `search` method of any finite state transducer. - #[inline] - pub fn new(query: &str, distance: u32) -> Result { - let lev = DynamicLevenshtein { - query: query.to_owned(), - dist: distance as usize, - }; - let dfa = DfaBuilder::new(lev.clone()).build()?; - Ok(Levenshtein { prog: lev, dfa }) - } -} - -impl fmt::Debug for Levenshtein { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!( - f, - "Levenshtein(query: {:?}, distance: {:?})", - self.prog.query, self.prog.dist - ) - } -} - -#[derive(Clone)] -struct DynamicLevenshtein { - query: String, - dist: usize, -} - -impl DynamicLevenshtein { - fn start(&self) -> Vec { - (0..self.query.chars().count() + 1).collect() - } - - fn is_match(&self, state: &[usize]) -> bool { - state.last().map(|&n| n <= self.dist).unwrap_or(false) - } - - fn can_match(&self, state: &[usize]) -> bool { - state.iter().min().map(|&n| n <= self.dist).unwrap_or(false) - } - - fn accept(&self, state: &[usize], chr: Option) -> Vec { - let mut next = vec![state[0] + 1]; - for (i, c) in self.query.chars().enumerate() { - let cost = if Some(c) == chr { 0 } else { 1 }; - let v = cmp::min( - cmp::min(next[i] + 1, state[i + 1] + 1), - state[i] + cost, - ); - next.push(cmp::min(v, self.dist + 1)); - } - next - } -} - -impl Automaton for Levenshtein { - type State = Option; - - #[inline] - fn start(&self) -> Option { - Some(0) - } - - #[inline] - fn is_match(&self, state: &Option) -> bool { - state.map(|state| self.dfa.states[state].is_match).unwrap_or(false) - } - - #[inline] - fn can_match(&self, state: &Option) -> bool { - state.is_some() - } - - #[inline] - fn accept(&self, state: &Option, byte: u8) -> Option { - state.and_then(|state| self.dfa.states[state].next[byte as usize]) - } -} - -#[derive(Debug)] -pub struct Dfa { - states: Vec, -} - -struct State { - next: [Option; 256], - is_match: bool, -} - -impl fmt::Debug for 
State { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - writeln!(f, "State {{")?; - writeln!(f, " is_match: {:?}", self.is_match)?; - for i in 0..256 { - if let Some(si) = self.next[i] { - writeln!(f, " {:?}: {:?}", i, si)?; - } - } - write!(f, "}}") - } -} - -struct DfaBuilder { - dfa: Dfa, - lev: DynamicLevenshtein, - cache: HashMap, usize>, -} - -impl DfaBuilder { - fn new(lev: DynamicLevenshtein) -> DfaBuilder { - DfaBuilder { - dfa: Dfa { states: Vec::with_capacity(16) }, - lev, - cache: HashMap::with_capacity(1024), - } - } - - fn build(mut self) -> Result { - let mut stack = vec![self.lev.start()]; - let mut seen = HashSet::new(); - let query = self.lev.query.clone(); // temp work around of borrowck - while let Some(lev_state) = stack.pop() { - let dfa_si = self.cached_state(&lev_state).unwrap(); - let mismatch = self.add_mismatch_utf8_states(dfa_si, &lev_state); - if let Some((next_si, lev_next)) = mismatch { - if !seen.contains(&next_si) { - seen.insert(next_si); - stack.push(lev_next); - } - } - for (i, c) in query.chars().enumerate() { - if lev_state[i] > self.lev.dist { - continue; - } - let lev_next = self.lev.accept(&lev_state, Some(c)); - let next_si = self.cached_state(&lev_next); - if let Some(next_si) = next_si { - self.add_utf8_sequences(true, dfa_si, next_si, c, c); - if !seen.contains(&next_si) { - seen.insert(next_si); - stack.push(lev_next); - } - } - } - if self.dfa.states.len() > STATE_LIMIT { - return Err(Error::TooManyStates(STATE_LIMIT)); - } - } - Ok(self.dfa) - } - - fn cached_state(&mut self, lev_state: &[usize]) -> Option { - self.cached(lev_state).map(|(si, _)| si) - } - - fn cached(&mut self, lev_state: &[usize]) -> Option<(usize, bool)> { - if !self.lev.can_match(lev_state) { - return None; - } - Some(match self.cache.entry(lev_state.to_vec()) { - Entry::Occupied(v) => (*v.get(), true), - Entry::Vacant(v) => { - let is_match = self.lev.is_match(lev_state); - self.dfa.states.push(State { next: [None; 256], is_match 
}); - (*v.insert(self.dfa.states.len() - 1), false) - } - }) - } - - fn add_mismatch_utf8_states( - &mut self, - from_si: usize, - lev_state: &[usize], - ) -> Option<(usize, Vec)> { - let mismatch_state = self.lev.accept(lev_state, None); - let to_si = match self.cached(&mismatch_state) { - None => return None, - Some((si, _)) => si, - // Some((si, true)) => return Some((si, mismatch_state)), - // Some((si, false)) => si, - }; - self.add_utf8_sequences(false, from_si, to_si, '\u{0}', '\u{10FFFF}'); - return Some((to_si, mismatch_state)); - } - - fn add_utf8_sequences( - &mut self, - overwrite: bool, - from_si: usize, - to_si: usize, - from_chr: char, - to_chr: char, - ) { - for seq in Utf8Sequences::new(from_chr, to_chr) { - let mut fsi = from_si; - for range in &seq.as_slice()[0..seq.len() - 1] { - let tsi = self.new_state(false); - self.add_utf8_range(overwrite, fsi, tsi, range); - fsi = tsi; - } - self.add_utf8_range( - overwrite, - fsi, - to_si, - &seq.as_slice()[seq.len() - 1], - ); - } - } - - fn add_utf8_range( - &mut self, - overwrite: bool, - from: usize, - to: usize, - range: &Utf8Range, - ) { - for b in range.start as usize..range.end as usize + 1 { - if overwrite || self.dfa.states[from].next[b].is_none() { - self.dfa.states[from].next[b] = Some(to); - } - } - } - - fn new_state(&mut self, is_match: bool) -> usize { - self.dfa.states.push(State { next: [None; 256], is_match }); - self.dfa.states.len() - 1 - } -} From 3df9bb09574f4802590b16fb26718276f18e2a71 Mon Sep 17 00:00:00 2001 From: Jonas Kruckenberg <118265418+CrabNejonas@users.noreply.github.com> Date: Wed, 17 Jan 2024 18:17:40 +0100 Subject: [PATCH 09/28] Delete fst-regex directory --- fst-regex/Cargo.toml | 18 --- fst-regex/src/compile.rs | 274 --------------------------------------- fst-regex/src/dfa.rs | 176 ------------------------- fst-regex/src/error.rs | 92 ------------- fst-regex/src/lib.rs | 176 ------------------------- fst-regex/src/sparse.rs | 36 ----- 6 files changed, 772 
deletions(-) delete mode 100644 fst-regex/Cargo.toml delete mode 100644 fst-regex/src/compile.rs delete mode 100644 fst-regex/src/dfa.rs delete mode 100644 fst-regex/src/error.rs delete mode 100644 fst-regex/src/lib.rs delete mode 100644 fst-regex/src/sparse.rs diff --git a/fst-regex/Cargo.toml b/fst-regex/Cargo.toml deleted file mode 100644 index 3556b3d2..00000000 --- a/fst-regex/Cargo.toml +++ /dev/null @@ -1,18 +0,0 @@ -[package] -name = "fst-regex" -version = "0.3.0" #:version -authors = ["Andrew Gallant "] -description = """ -DEPRECATED. Use 'regex-automata' crate with 'transducer' feature instead. -""" -documentation = "https://docs.rs/fst-regex" -homepage = "https://github.com/BurntSushi/fst" -repository = "https://github.com/BurntSushi/fst" -keywords = ["search", "information", "retrieval", "dictionary", "map"] -license = "Unlicense/MIT" -edition = "2018" - -[dependencies] -fst = { version = "0.3.1", default-features = false } -regex-syntax = "0.3" -utf8-ranges = "1" diff --git a/fst-regex/src/compile.rs b/fst-regex/src/compile.rs deleted file mode 100644 index 92136684..00000000 --- a/fst-regex/src/compile.rs +++ /dev/null @@ -1,274 +0,0 @@ -use regex_syntax::{CharClass, ClassRange, Expr, Repeater}; -use utf8_ranges::{Utf8Sequence, Utf8Sequences}; - -use crate::{Error, Inst}; - -pub struct Compiler { - size_limit: usize, - insts: Vec, -} - -impl Compiler { - pub fn new(size_limit: usize) -> Compiler { - Compiler { size_limit: size_limit, insts: vec![] } - } - - pub fn compile(mut self, ast: &Expr) -> Result, Error> { - self.c(ast)?; - self.insts.push(Inst::Match); - Ok(self.insts) - } - - fn c(&mut self, ast: &Expr) -> Result<(), Error> { - match *ast { - Expr::StartLine - | Expr::EndLine - | Expr::StartText - | Expr::EndText => return Err(From::from(Error::NoEmpty)), - Expr::WordBoundary - | Expr::NotWordBoundary - | Expr::WordBoundaryAscii - | Expr::NotWordBoundaryAscii => { - return Err(From::from(Error::NoWordBoundary)); - } - Expr::LiteralBytes { .. 
} - | Expr::AnyByte - | Expr::AnyByteNoNL - | Expr::ClassBytes(..) => { - return Err(From::from(Error::NoBytes)); - } - Expr::Empty => {} - Expr::Literal { ref chars, casei } => { - for &c in chars { - if casei { - self.c(&Expr::Class( - CharClass::new(vec![ClassRange { - start: c, - end: c, - }]) - .case_fold(), - ))?; - } else { - // One scalar value, so we're guaranteed to get a - // single byte sequence. - for seq in Utf8Sequences::new(c, c) { - self.compile_utf8_ranges(&seq); - } - } - } - } - Expr::AnyChar => { - self.c(&Expr::Class(CharClass::new(vec![ClassRange { - start: '\u{0}', - end: '\u{10FFFF}', - }])))? - } - Expr::AnyCharNoNL => { - self.c(&Expr::Class(CharClass::new(vec![ - ClassRange { start: '\u{0}', end: '\u{09}' }, - ClassRange { start: '\u{0B}', end: '\u{10FFFF}' }, - ])))? - } - Expr::Class(ref cls) => { - self.compile_class(cls)?; - } - Expr::Group { ref e, .. } => self.c(e)?, - Expr::Concat(ref es) => { - for e in es { - self.c(e)?; - } - } - Expr::Alternate(ref es) => { - if es.len() == 0 { - return Ok(()); - } - let mut jmps_to_end = vec![]; - for e in &es[0..es.len() - 1] { - let split = self.empty_split(); - let j1 = self.insts.len(); - self.c(e)?; - jmps_to_end.push(self.empty_jump()); - let j2 = self.insts.len(); - self.set_split(split, j1, j2); - } - self.c(&es[es.len() - 1])?; - let end = self.insts.len(); - for jmp_to_end in jmps_to_end { - self.set_jump(jmp_to_end, end); - } - } - Expr::Repeat { greedy: false, .. } => { - return Err(Error::NoLazy.into()); - } - Expr::Repeat { ref e, r: Repeater::ZeroOrOne, .. } => { - let split = self.empty_split(); - let j1 = self.insts.len(); - self.c(e)?; - let j2 = self.insts.len(); - self.set_split(split, j1, j2); - } - Expr::Repeat { ref e, r: Repeater::ZeroOrMore, .. 
} => { - let j1 = self.insts.len(); - let split = self.empty_split(); - let j2 = self.insts.len(); - self.c(e)?; - let jmp = self.empty_jump(); - let j3 = self.insts.len(); - - self.set_jump(jmp, j1); - self.set_split(split, j2, j3); - } - Expr::Repeat { ref e, r: Repeater::OneOrMore, .. } => { - let j1 = self.insts.len(); - self.c(e)?; - let split = self.empty_split(); - let j2 = self.insts.len(); - self.set_split(split, j1, j2); - } - Expr::Repeat { - ref e, - r: Repeater::Range { min, max: None }, - .. - } => { - for _ in 0..min { - self.c(e)?; - } - self.c(&Expr::Repeat { - e: e.clone(), - r: Repeater::ZeroOrMore, - greedy: true, - })?; - } - Expr::Repeat { - ref e, - r: Repeater::Range { min, max: Some(max) }, - .. - } => { - for _ in 0..min { - self.c(e)?; - } - let (mut splits, mut starts) = (vec![], vec![]); - for _ in min..max { - splits.push(self.empty_split()); - starts.push(self.insts.len()); - self.c(e)?; - } - let end = self.insts.len(); - for (split, start) in splits.into_iter().zip(starts) { - self.set_split(split, start, end); - } - } - } - self.check_size() - } - - fn compile_class(&mut self, class: &CharClass) -> Result<(), Error> { - if class.is_empty() { - return Ok(()); - } - let mut jmps = vec![]; - for r in &class[0..class.len() - 1] { - let split = self.empty_split(); - let j1 = self.insts.len(); - self.compile_class_range(r)?; - jmps.push(self.empty_jump()); - let j2 = self.insts.len(); - self.set_split(split, j1, j2); - } - self.compile_class_range(&class[class.len() - 1])?; - let end = self.insts.len(); - for jmp in jmps { - self.set_jump(jmp, end); - } - Ok(()) - } - - fn compile_class_range( - &mut self, - char_range: &ClassRange, - ) -> Result<(), Error> { - let mut it = - Utf8Sequences::new(char_range.start, char_range.end).peekable(); - let mut jmps = vec![]; - let mut utf8_ranges = it.next().expect("non-empty char class"); - while it.peek().is_some() { - let split = self.empty_split(); - let j1 = self.insts.len(); - 
self.compile_utf8_ranges(&utf8_ranges); - jmps.push(self.empty_jump()); - let j2 = self.insts.len(); - self.set_split(split, j1, j2); - utf8_ranges = it.next().unwrap(); // because peek says so - } - self.compile_utf8_ranges(&utf8_ranges); - let end = self.insts.len(); - for jmp in jmps { - self.set_jump(jmp, end); - } - Ok(()) - } - - fn compile_utf8_ranges(&mut self, utf8_ranges: &Utf8Sequence) { - for r in utf8_ranges { - self.push(Inst::Range(r.start, r.end)); - } - } - - fn check_size(&self) -> Result<(), Error> { - use std::mem::size_of; - - if self.insts.len() * size_of::() > self.size_limit { - Err(Error::CompiledTooBig(self.size_limit).into()) - } else { - Ok(()) - } - } - - /// Appends the given instruction to the program. - #[inline] - fn push(&mut self, x: Inst) { - self.insts.push(x) - } - - /// Appends an *empty* `Split` instruction to the program and returns - /// the index of that instruction. (The index can then be used to "patch" - /// the actual locations of the split in later.) - #[inline] - fn empty_split(&mut self) -> usize { - self.insts.push(Inst::Split(0, 0)); - self.insts.len() - 1 - } - - /// Sets the left and right locations of a `Split` instruction at index - /// `i` to `pc1` and `pc2`, respectively. - /// If the instruction at index `i` isn't a `Split` instruction, then - /// `panic!` is called. - #[inline] - fn set_split(&mut self, i: usize, pc1: usize, pc2: usize) { - let split = &mut self.insts[i]; - match *split { - Inst::Split(_, _) => *split = Inst::Split(pc1, pc2), - _ => panic!("BUG: Invalid split index."), - } - } - - /// Appends an *empty* `Jump` instruction to the program and returns the - /// index of that instruction. - #[inline] - fn empty_jump(&mut self) -> usize { - self.insts.push(Inst::Jump(0)); - self.insts.len() - 1 - } - - /// Sets the location of a `Jump` instruction at index `i` to `pc`. - /// If the instruction at index `i` isn't a `Jump` instruction, then - /// `panic!` is called. 
- #[inline] - fn set_jump(&mut self, i: usize, pc: usize) { - let jmp = &mut self.insts[i]; - match *jmp { - Inst::Jump(_) => *jmp = Inst::Jump(pc), - _ => panic!("BUG: Invalid jump index."), - } - } -} diff --git a/fst-regex/src/dfa.rs b/fst-regex/src/dfa.rs deleted file mode 100644 index b9755ef8..00000000 --- a/fst-regex/src/dfa.rs +++ /dev/null @@ -1,176 +0,0 @@ -use std::collections::{HashMap, HashSet}; -use std::fmt; - -use crate::sparse::SparseSet; -use crate::{Error, Inst}; - -const STATE_LIMIT: usize = 1_000; // currently at least 2MB >_< - -pub struct DfaBuilder { - dfa: Dfa, - cache: HashMap, usize>, -} - -pub struct Dfa { - insts: Vec, - states: Vec, -} - -struct State { - insts: Vec, - next: [Option; 256], - is_match: bool, -} - -impl DfaBuilder { - pub fn new(insts: Vec) -> Self { - DfaBuilder { - dfa: Dfa { insts: insts, states: Vec::with_capacity(16) }, - cache: HashMap::with_capacity(1024), - } - } - - pub fn build(mut self) -> Result { - let mut cur = SparseSet::new(self.dfa.insts.len()); - let mut next = SparseSet::new(self.dfa.insts.len()); - - self.dfa.add(&mut cur, 0); - let mut states = vec![self.cached_state(&cur).unwrap()]; - let mut seen = HashSet::new(); - while let Some(s) = states.pop() { - for b in 0..256 { - let ns = self.run_state(&mut cur, &mut next, s, b as u8); - if let Some(ns) = ns { - if !seen.contains(&ns) { - seen.insert(ns); - states.push(ns); - } - } - if self.dfa.states.len() > STATE_LIMIT { - return Err(Error::TooManyStates(STATE_LIMIT).into()); - } - } - } - Ok(self.dfa) - } - - fn run_state( - &mut self, - cur: &mut SparseSet, - next: &mut SparseSet, - state: usize, - byte: u8, - ) -> Option { - cur.clear(); - for &ip in &self.dfa.states[state].insts { - cur.add(ip); - } - self.dfa.run(cur, next, byte); - let next_state = self.cached_state(next); - self.dfa.states[state].next[byte as usize] = next_state; - next_state - } - - fn cached_state(&mut self, set: &SparseSet) -> Option { - use std::collections::hash_map::Entry; 
- use crate::Inst::*; - - // There are probably many ways to optimize this routine. ---AG - - let mut insts = vec![]; - let mut is_match = false; - for i in 0..set.len() { - let ip = set.get(i); - match self.dfa.insts[ip] { - Jump(_) | Split(_, _) => {} - Range(_, _) => insts.push(ip), - Match => { - is_match = true; - insts.push(ip); - } - } - } - if insts.len() == 0 { - return None; - } - Some(match self.cache.entry(insts.clone()) { - Entry::Occupied(v) => *v.get(), - Entry::Vacant(v) => { - self.dfa.states.push(State { - insts: insts, - next: [None; 256], - is_match: is_match, - }); - *v.insert(self.dfa.states.len() - 1) - } - }) - } -} - -impl Dfa { - pub fn is_match(&self, si: usize) -> bool { - self.states[si].is_match - } - - pub fn accept(&self, si: usize, byte: u8) -> Option { - self.states[si].next[byte as usize] - } - - fn add(&self, set: &mut SparseSet, ip: usize) { - use crate::Inst::*; - - if set.contains(ip) { - return; - } - set.add(ip); - match self.insts[ip] { - Match | Range(_, _) => {} - Jump(ip) => self.add(set, ip), - Split(ip1, ip2) => { - self.add(set, ip1); - self.add(set, ip2); - } - } - } - - fn run(&self, from: &SparseSet, to: &mut SparseSet, byte: u8) -> bool { - use crate::Inst::*; - to.clear(); - let mut is_match = false; - for i in 0..from.len() { - let ip = from.get(i); - match self.insts[ip] { - Jump(_) | Split(_, _) => {} - Match => is_match = true, - Range(s, e) => { - if s <= byte && byte <= e { - self.add(to, ip + 1); - } - } - } - } - is_match - } -} - -impl fmt::Debug for Dfa { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - for (i, inst) in self.insts.iter().enumerate() { - writeln!(f, "{:03} {:?}", i, inst)?; - } - writeln!(f, "------------")?; - for (i, state) in self.states.iter().enumerate() { - if state.is_match { - writeln!(f, "{:03}* {:?}", i, state.insts)?; - } else { - writeln!(f, "{:03} {:?}", i, state.insts)?; - } - for j in 0..256 { - if let Some(si) = state.next[j] { - writeln!(f, "{:03} {:X} => 
{}", i, j, si)?; - } - } - } - Ok(()) - } -} diff --git a/fst-regex/src/error.rs b/fst-regex/src/error.rs deleted file mode 100644 index c757be84..00000000 --- a/fst-regex/src/error.rs +++ /dev/null @@ -1,92 +0,0 @@ -use std::error; -use std::fmt; - -use regex_syntax; - -/// An error that occurred while compiling a regular expression. -#[derive(Debug)] -pub enum Error { - /// A problem with the syntax of a regular expression. - Syntax(regex_syntax::Error), - /// Too many instructions resulting from the regular expression. - /// - /// The number given is the limit that was exceeded. - CompiledTooBig(usize), - /// Too many automata states resulting from the regular expression. - /// - /// This is distinct from `CompiledTooBig` because `TooManyStates` refers - /// to the DFA construction where as `CompiledTooBig` refers to the NFA - /// construction. - /// - /// The number given is the limit that was exceeded. - TooManyStates(usize), - /// Lazy quantifiers are not allowed (because they have no useful - /// interpretation when used purely for automata intersection, as is the - /// case in this crate). - NoLazy, - /// Word boundaries are currently not allowed. - /// - /// This restriction may be lifted in the future. - NoWordBoundary, - /// Empty or "zero width assertions" such as `^` or `$` are currently - /// not allowed. - /// - /// This restriction may be lifted in the future. - NoEmpty, - /// Byte literals such as `(?-u:\xff)` are not allowed. - /// - /// This restriction may be lifted in the future. 
- NoBytes, -} - -impl From for Error { - #[inline] - fn from(err: regex_syntax::Error) -> Error { - Error::Syntax(err) - } -} - -impl fmt::Display for Error { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - use self::Error::*; - match *self { - Syntax(ref err) => err.fmt(f), - CompiledTooBig(size_limit) => write!( - f, - "Compiled regex exceeds size limit of {} bytes", - size_limit - ), - TooManyStates(size_limit) => write!( - f, - "Compiled regex exceeds size limit of {} states", - size_limit - ), - NoLazy => write!( - f, - "Lazy reptition operators such as '+?' are \ - not allowed." - ), - NoWordBoundary => write!( - f, - "Word boundary operators are not \ - allowed." - ), - NoEmpty => write!( - f, - "Empty match operators are not allowed \ - (hopefully temporary)." - ), - NoBytes => write!(f, "Byte literals are not allowed."), - } - } -} - -impl error::Error for Error { - fn source(&self) -> Option<&(dyn error::Error + 'static)> { - use self::Error::*; - match *self { - Syntax(ref err) => Some(err), - _ => None, - } - } -} diff --git a/fst-regex/src/lib.rs b/fst-regex/src/lib.rs deleted file mode 100644 index d24b84ec..00000000 --- a/fst-regex/src/lib.rs +++ /dev/null @@ -1,176 +0,0 @@ - -use regex_syntax; - - -use std::fmt; - -use fst::Automaton; - -pub use crate::error::Error; - -mod compile; -mod dfa; -mod error; -mod sparse; - -/// A regular expression for searching FSTs with Unicode support. -/// -/// Regular expressions are compiled down to a deterministic finite automaton -/// that can efficiently search any finite state transducer. Notably, most -/// regular expressions only need to explore a small portion of a finite state -/// transducer without loading all of it into memory. -/// -/// # Syntax -/// -/// `Regex` supports fully featured regular expressions. Namely, it supports -/// all of the same constructs as the standard `regex` crate except for the -/// following things: -/// -/// 1. 
Lazy quantifiers, since a regular expression automaton only reports -/// whether a key matches at all, and not its location. Namely, lazy -/// quantifiers such as `+?` only modify the location of a match, but never -/// change a non-match into a match or a match into a non-match. -/// 2. Word boundaries (i.e., `\b`). Because such things are hard to do in -/// a deterministic finite automaton, but not impossible. As such, these -/// may be allowed some day. -/// 3. Other zero width assertions like `^` and `$`. These are easier to -/// support than word boundaries, but are still tricky and usually aren't -/// as useful when searching dictionaries. -/// -/// Otherwise, the [full syntax of the `regex` -/// crate](https://docs.rs/regex/*/regex/#syntax) -/// is supported. This includes all Unicode support and relevant flags. -/// (The `U` and `m` flags are no-ops because of (1) and (3) above, -/// respectively.) -/// -/// # Matching semantics -/// -/// A regular expression matches a key in a finite state transducer if and only -/// if it matches from the start of a key all the way to end. Stated -/// differently, every regular expression `(re)` is matched as if it were -/// `^(re)$`. This means that if you want to do a substring match, then you -/// must use `.*substring.*`. -/// -/// **Caution**: Starting a regular expression with `.*` means that it could -/// potentially match *any* key in a finite state transducer. This implies that -/// all keys could be visited, which could be slow. It is possible that this -/// crate will grow facilities for detecting regular expressions that will -/// scan a large portion of a transducer and optionally disallow them. -/// -/// # Example -/// -/// This example shows how to run a regular expression on a `Set`. 
-/// -/// ```rust -/// extern crate fst; -/// extern crate fst_regex; -/// -/// use fst::{IntoStreamer, Streamer, Set}; -/// use fst_regex::Regex; -/// -/// fn main() { -/// let set = Set::from_iter(&["foo", "foo1", "foo2", "foo3", "foobar"]) -/// .unwrap(); -/// -/// let re = Regex::new("f[a-z]+3?").unwrap(); -/// let mut stream = set.search(&re).into_stream(); -/// -/// let mut keys = vec![]; -/// while let Some(key) = stream.next() { -/// keys.push(key.to_vec()); -/// } -/// assert_eq!(keys, vec![ -/// "foo".as_bytes(), "foo3".as_bytes(), "foobar".as_bytes(), -/// ]); -/// } -/// ``` -/// -/// # Warning: experimental -/// -/// While executing a regular expression against a finite state transducer will -/// be very fast, *construction* of a regular expression automaton may not be. -/// Namely, this implementation is a proof of concept. In particular, one of -/// its major deficiencies is that it can use enormous amounts of memory. -/// Note though, that the construction phase will return an error if the -/// underlying automata grows too big (tens of MB). -/// -/// This is important functionality, so one should count on this implementation -/// being vastly improved in the future. -pub struct Regex { - original: String, - dfa: dfa::Dfa, -} - -#[derive(Eq, PartialEq)] -pub enum Inst { - Match, - Jump(usize), - Split(usize, usize), - Range(u8, u8), -} - -impl Regex { - /// Create a new regular expression query. - /// - /// The query finds all terms matching the regular expression. - /// - /// If the regular expression is malformed or if it results in an automaton - /// that is too big, then an error is returned. - /// - /// A `Regex` value satisfies the `Automaton` trait, which means it can be - /// used with the `search` method of any finite state transducer. 
- #[inline] - pub fn new(re: &str) -> Result { - Regex::with_size_limit(10 * (1 << 20), re) - } - - fn with_size_limit(size: usize, re: &str) -> Result { - let expr = regex_syntax::Expr::parse(re)?; - let insts = compile::Compiler::new(size).compile(&expr)?; - let dfa = dfa::DfaBuilder::new(insts).build()?; - Ok(Regex { original: re.to_owned(), dfa: dfa }) - } -} - -impl Automaton for Regex { - type State = Option; - - #[inline] - fn start(&self) -> Option { - Some(0) - } - - #[inline] - fn is_match(&self, state: &Option) -> bool { - state.map(|state| self.dfa.is_match(state)).unwrap_or(false) - } - - #[inline] - fn can_match(&self, state: &Option) -> bool { - state.is_some() - } - - #[inline] - fn accept(&self, state: &Option, byte: u8) -> Option { - state.and_then(|state| self.dfa.accept(state, byte)) - } -} - -impl fmt::Debug for Regex { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - writeln!(f, "Regex({:?})", self.original)?; - self.dfa.fmt(f) - } -} - -impl fmt::Debug for Inst { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - use self::Inst::*; - match *self { - Match => write!(f, "Match"), - Jump(ip) => write!(f, "JUMP {}", ip), - Split(ip1, ip2) => write!(f, "SPLIT {}, {}", ip1, ip2), - Range(s, e) => write!(f, "RANGE {:X}-{:X}", s, e), - } - } -} diff --git a/fst-regex/src/sparse.rs b/fst-regex/src/sparse.rs deleted file mode 100644 index 7df3ac6e..00000000 --- a/fst-regex/src/sparse.rs +++ /dev/null @@ -1,36 +0,0 @@ -pub struct SparseSet { - dense: Vec, - sparse: Vec, - size: usize, -} - -impl SparseSet { - pub fn new(size: usize) -> SparseSet { - SparseSet { dense: vec![0; size], sparse: vec![0; size], size: 0 } - } - - pub fn len(&self) -> usize { - self.size - } - - pub fn add(&mut self, ip: usize) -> usize { - let i = self.size; - self.dense[i] = ip; - self.sparse[ip] = i; - self.size += 1; - i - } - - pub fn get(&self, i: usize) -> usize { - self.dense[i] - } - - pub fn contains(&self, ip: usize) -> bool { - let i = 
self.sparse[ip]; - i < self.size && self.dense[i] == ip - } - - pub fn clear(&mut self) { - self.size = 0; - } -} From 0baa27e4409754404d316c857bd7a585cb90f688 Mon Sep 17 00:00:00 2001 From: Jonas Kruckenberg <118265418+CrabNejonas@users.noreply.github.com> Date: Wed, 17 Jan 2024 18:27:25 +0100 Subject: [PATCH 10/28] Update ci.yml --- .github/workflows/ci.yml | 218 ++++++++++++++------------------------- 1 file changed, 77 insertions(+), 141 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a782751d..15a0e4a7 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,159 +1,95 @@ -name: ci +name: CI + on: - pull_request: - branches: - - master push: - branches: - - master - schedule: - - cron: '00 01 * * *' + paths: + - ".github/workflows/ci.yml" + - "**/*.rs" + - "**/Cargo.toml" + pull_request: + paths: + - ".github/workflows/ci.yml" + - "**/*.rs" + - "**/Cargo.toml" -# The section is needed to drop write-all permissions that are granted on -# `schedule` event. By specifying any permission explicitly all others are set -# to none. By using the principle of least privilege the damage a compromised -# workflow can do (because of an injection or compromised third party tool or -# action) is restricted. Currently the worklow doesn't need any additional -# permission except for pulling the code. Adding labels to issues, commenting -# on pull-requests, etc. 
may need additional permissions: -# -# Syntax for this section: -# https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#permissions -# -# Reference for how to assign permissions on a job-by-job basis: -# https://docs.github.com/en/actions/using-jobs/assigning-permissions-to-jobs -# -# Reference for available permissions that we can enable if needed: -# https://docs.github.com/en/actions/security-guides/automatic-token-authentication#permissions-for-the-github_token permissions: - # to fetch code (actions/checkout) contents: read +env: + RUSTFLAGS: -Dwarnings + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + jobs: + pre_ci: + uses: dtolnay/.github/.github/workflows/pre_ci.yml@master + test: - name: test - env: - # For some builds, we use cross to test on 32-bit and big-endian - # systems. - CARGO: cargo - # When CARGO is set to CROSS, TARGET is set to `--target matrix.target`. - # Note that we only use cross on Linux, so setting a target on a - # different OS will just use normal cargo. - TARGET: - # Bump this as appropriate. We pin to a version to make sure CI - # continues to work as cross releases in the past have broken things - # in subtle ways. 
- CROSS_VERSION: v0.2.5 - runs-on: ${{ matrix.os }} + name: Rust ${{matrix.rust}} + needs: pre_ci + if: needs.pre_ci.outputs.continue + runs-on: ubuntu-latest strategy: fail-fast: false matrix: - include: - - build: pinned - os: ubuntu-latest - rust: 1.60.0 - - build: stable - os: ubuntu-latest - rust: stable - - build: beta - os: ubuntu-latest - rust: beta - - build: nightly - os: ubuntu-latest - rust: nightly - - build: macos - os: macos-latest - rust: stable - - build: win-msvc - os: windows-latest - rust: stable - - build: win-gnu - os: windows-latest - rust: stable-x86_64-gnu - - build: stable-x86 - os: ubuntu-latest - rust: stable - target: i686-unknown-linux-gnu - - build: stable-aarch64 - os: ubuntu-latest - rust: stable - target: aarch64-unknown-linux-gnu - - build: stable-powerpc64 - os: ubuntu-latest - rust: stable - target: powerpc64-unknown-linux-gnu - - build: stable-s390x - os: ubuntu-latest - rust: stable - target: s390x-unknown-linux-gnu + rust: [nightly, stable] + features: ["", "--no-default-features", "--no-default-features --features alloc"] + timeout-minutes: 45 steps: - - name: Checkout repository - uses: actions/checkout@v4 - - - name: Install Rust - uses: dtolnay/rust-toolchain@master - with: - toolchain: ${{ matrix.rust }} - - - name: Use Cross - if: matrix.os == 'ubuntu-latest' && matrix.target != '' - run: | - # In the past, new releases of 'cross' have broken CI. So for now, we - # pin it. We also use their pre-compiled binary releases because cross - # has over 100 dependencies and takes a bit to compile. 
- dir="$RUNNER_TEMP/cross-download" - mkdir "$dir" - echo "$dir" >> $GITHUB_PATH - cd "$dir" - curl -LO "https://github.com/cross-rs/cross/releases/download/$CROSS_VERSION/cross-x86_64-unknown-linux-musl.tar.gz" - tar xf cross-x86_64-unknown-linux-musl.tar.gz - echo "CARGO=cross" >> $GITHUB_ENV - echo "TARGET=--target ${{ matrix.target }}" >> $GITHUB_ENV - - - name: Show command used for Cargo - run: | - echo "cargo command is: ${{ env.CARGO }}" - echo "target flag is: ${{ env.TARGET }}" - - - name: Build (just fst crate) - if: matrix.build == 'pinned' - run: ${{ env.CARGO }} build --verbose ${{ env.TARGET }} + - uses: actions/checkout@v3 + - uses: dtolnay/rust-toolchain@master + with: + toolchain: ${{matrix.rust}} + - uses: Swatinem/rust-cache@v2 + - run: cargo test --workspace + env: + RUSTFLAGS: ${{matrix.rustflags}} ${{env.RUSTFLAGS}} - - name: Build - if: matrix.build != 'pinned' - run: ${{ env.CARGO }} build --verbose --all ${{ env.TARGET }} - - - name: Build docs - if: matrix.build != 'pinned' - run: ${{ env.CARGO }} doc --verbose --all ${{ env.TARGET }} - - - name: Run tests - if: matrix.build != 'pinned' - run: ${{ env.CARGO }} test --verbose --all ${{ env.TARGET }} - - - name: Run tests without default features - if: matrix.build != 'pinned' - run: ${{ env.CARGO }} test --verbose --lib --no-default-features ${{ env.TARGET }} - - - name: Build fst CLI tool - if: matrix.build != 'pinned' - run: ${{ env.CARGO }} build --verbose --manifest-path fst-bin/Cargo.toml ${{ env.TARGET }} + msrv: + name: Rust MSRV + needs: pre_ci + if: needs.pre_ci.outputs.continue + runs-on: ubuntu-latest + timeout-minutes: 45 + steps: + - uses: actions/checkout@v3 + - uses: dtolnay/rust-toolchain@stable + with: + toolchain: 1.70.0 # MSRV + - uses: Swatinem/rust-cache@v2 + - run: cargo check --workspace --tests - - name: Compile benchmarks - if: matrix.build != 'pinned' - run: ${{ env.CARGO }} bench --manifest-path bench/Cargo.toml --verbose ${{ env.TARGET }} -- --test + clippy: + 
name: Clippy + runs-on: ubuntu-latest + if: github.event_name != 'pull_request' + timeout-minutes: 45 + steps: + - uses: actions/checkout@v3 + - uses: dtolnay/rust-toolchain@stable + with: + components: clippy + - uses: Swatinem/rust-cache@v2 + - run: cargo clippy --workspace -- -Dclippy::all -Dclippy::pedantic rustfmt: - name: rustfmt + name: Rustfmt + runs-on: ubuntu-latest + if: github.event_name != 'pull_request' + timeout-minutes: 45 + steps: + - uses: actions/checkout@v3 + - name: Install rustfmt with nightly toolchain + uses: dtolnay/rust-toolchain@nightly + with: + components: rustfmt + - run: cargo fmt --all -- --check + + check-licenses: runs-on: ubuntu-latest steps: - - name: Checkout repository - uses: actions/checkout@v4 - - name: Install Rust - uses: dtolnay/rust-toolchain@master - with: - toolchain: stable - components: rustfmt - - name: Check formatting - run: | - cargo fmt --all -- --check + - uses: actions/checkout@v3 + - name: Check Rust Licenses + uses: EmbarkStudios/cargo-deny-action@v1 From 13cd9cc6da03f23cf3f16727feaaef5a512e462c Mon Sep 17 00:00:00 2001 From: Jonas Kruckenberg <118265418+CrabNejonas@users.noreply.github.com> Date: Wed, 17 Jan 2024 18:28:05 +0100 Subject: [PATCH 11/28] Update ci.yml --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 15a0e4a7..4c1d1e4c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -45,7 +45,7 @@ jobs: - uses: Swatinem/rust-cache@v2 - run: cargo test --workspace env: - RUSTFLAGS: ${{matrix.rustflags}} ${{env.RUSTFLAGS}} + RUSTFLAGS: ${{matrix.rustflags}} ${{env.RUSTFLAGS}} ${{matrix.features}} msrv: name: Rust MSRV From 638811ddb5b65310c133abeb540fedc15ad639f5 Mon Sep 17 00:00:00 2001 From: Jonas Kruckenberg <118265418+CrabNejonas@users.noreply.github.com> Date: Wed, 17 Jan 2024 18:28:22 +0100 Subject: [PATCH 12/28] Update ci.yml --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4c1d1e4c..7d3566f9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -43,9 +43,9 @@ jobs: with: toolchain: ${{matrix.rust}} - uses: Swatinem/rust-cache@v2 - - run: cargo test --workspace + - run: cargo test --workspace ${{matrix.features}} env: - RUSTFLAGS: ${{matrix.rustflags}} ${{env.RUSTFLAGS}} ${{matrix.features}} + RUSTFLAGS: ${{matrix.rustflags}} ${{env.RUSTFLAGS}} msrv: name: Rust MSRV From a5b4e747cd28e35df194b7ebf7d64268e49c841c Mon Sep 17 00:00:00 2001 From: Jonas Kruckenberg <118265418+CrabNejonas@users.noreply.github.com> Date: Wed, 17 Jan 2024 18:29:17 +0100 Subject: [PATCH 13/28] Create release-plz.yml --- .github/workflows/release-plz.yml | 32 +++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 .github/workflows/release-plz.yml diff --git a/.github/workflows/release-plz.yml b/.github/workflows/release-plz.yml new file mode 100644 index 00000000..d7958128 --- /dev/null +++ b/.github/workflows/release-plz.yml @@ -0,0 +1,32 @@ +name: Release-plz + +permissions: + pull-requests: write + contents: write + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +on: + push: + branches: + - main + +jobs: + release-plz: + name: Release-plz + runs-on: ubuntu-latest + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@stable + - uses: Swatinem/rust-cache@v2 + - name: Run release-plz + uses: MarcoIeni/release-plz-action@v0.5 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }} From 4b99b1458bfe01d01a8f32571ddd13683e7008ac Mon Sep 17 00:00:00 2001 From: Jonas Kruckenberg <118265418+CrabNejonas@users.noreply.github.com> Date: Wed, 17 Jan 2024 18:30:39 +0100 Subject: [PATCH 14/28] Update Cargo.toml --- Cargo.toml | 8 
++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index d9981bfd..4805d752 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,14 +1,14 @@ [package] name = "fst-no-std" version = "0.4.7" #:version -authors = ["Andrew Gallant "] +authors = ["Andrew Gallant ", "Jonas Kruckenberg "] description = """ Use finite state transducers to compactly represents sets or maps of many strings (> 1 billion is possible). """ -documentation = "https://docs.rs/fst" -homepage = "https://github.com/BurntSushi/fst" -repository = "https://github.com/BurntSushi/fst" +documentation = "https://docs.rs/fst-no-std" +homepage = "https://github.com/CrabNejonas/fst" +repository = "https://github.com/CrabNejonas/fst" readme = "README.md" keywords = ["search", "information", "retrieval", "dictionary", "map"] license = "Unlicense/MIT" From 9c07a5333e3319a0956818b1000a7152fe1153ec Mon Sep 17 00:00:00 2001 From: Jonas Kruckenberg <118265418+CrabNejonas@users.noreply.github.com> Date: Wed, 17 Jan 2024 18:30:57 +0100 Subject: [PATCH 15/28] Update Cargo.toml --- Cargo.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 4805d752..1cf13a98 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,7 +16,6 @@ edition = "2018" [workspace] members = ["bench"] -exclude = ["fst-levenshtein", "fst-regex"] [features] default = ["std"] From 21da075fc07092c83f144dab7c04872e29606a5d Mon Sep 17 00:00:00 2001 From: Jonas Kruckenberg <118265418+CrabNejonas@users.noreply.github.com> Date: Wed, 17 Jan 2024 18:32:45 +0100 Subject: [PATCH 16/28] Update Cargo.toml --- bench/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bench/Cargo.toml b/bench/Cargo.toml index 07fce0a4..7d334f88 100644 --- a/bench/Cargo.toml +++ b/bench/Cargo.toml @@ -20,4 +20,4 @@ path = "src/bench.rs" [dependencies] criterion = "0.3.1" fnv = "1.0.6" -fst = { path = "..", features = ["levenshtein"] } +fst-no-std = { path = "..", features = ["levenshtein"] } From 
24989ae3115e727c79de63e0836ee9d5dac5a004 Mon Sep 17 00:00:00 2001 From: Jonas Kruckenberg <118265418+CrabNejonas@users.noreply.github.com> Date: Wed, 17 Jan 2024 19:03:07 +0100 Subject: [PATCH 17/28] cleanup --- .github/workflows/ci.yml | 11 +++++-- Cargo.toml | 2 +- README.md | 2 +- src/automaton/levenshtein.rs | 12 ++++--- src/automaton/mod.rs | 12 +++---- src/bytes.rs | 3 ++ src/lib.rs | 14 ++++----- src/map.rs | 58 +++++++++++++++++----------------- src/raw/build.rs | 28 +++++------------ src/raw/common_inputs.rs | 1 + src/raw/counting_writer.rs | 3 -- src/raw/error.rs | 7 +++-- src/raw/mod.rs | 14 +++++++-- src/raw/node.rs | 18 +++++++++-- src/raw/ops.rs | 13 +++++--- src/raw/registry.rs | 21 +++++++------ src/set.rs | 61 +++++++++++++++++++----------------- tests/test.rs | 18 +++++------ 18 files changed, 165 insertions(+), 133 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7d3566f9..0eb03e23 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -34,8 +34,15 @@ jobs: strategy: fail-fast: false matrix: - rust: [nightly, stable] - features: ["", "--no-default-features", "--no-default-features --features alloc"] + include: + - rust: stable + features: "" + - rust: nightly + features: "" + - rust: nightly + features: "--no-default-features" + - rust: nightly + features: "--no-default-features --features alloc" timeout-minutes: 45 steps: - uses: actions/checkout@v3 diff --git a/Cargo.toml b/Cargo.toml index 1cf13a98..b879670b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,7 +19,7 @@ members = ["bench"] [features] default = ["std"] -levenshtein = ["utf8-ranges"] +levenshtein = ["utf8-ranges", "std"] std = ["alloc"] alloc = [] diff --git a/README.md b/README.md index 9ceba51f..63b9a6ee 100644 --- a/README.md +++ b/README.md @@ -53,7 +53,7 @@ Check out the documentation for a lot more examples! * `alloc` - **Enabled** by default. Adds features that depend on `alloc`. * `levenshtein` - **Disabled** by default. 
This adds the `Levenshtein` automaton to the `automaton` sub-module. This includes an additional - dependency on `utf8-ranges`. + dependency on `utf8-ranges` and `std`. ### `no_std` Usage diff --git a/src/automaton/levenshtein.rs b/src/automaton/levenshtein.rs index 63b0dbf0..b30c4324 100644 --- a/src/automaton/levenshtein.rs +++ b/src/automaton/levenshtein.rs @@ -1,9 +1,9 @@ use core::cmp; use core::fmt; -#[cfg(feature = "alloc")] -use alloc::collections::hash_map::Entry; -#[cfg(feature = "alloc")] -use alloc::collections::{HashMap, HashSet}; +#[cfg(feature = "std")] +use std::collections::hash_map::Entry; +#[cfg(feature = "std")] +use std::collections::{HashMap, HashSet}; use utf8_ranges::{Utf8Range, Utf8Sequences}; @@ -36,8 +36,12 @@ impl fmt::Display for LevenshteinError { } } +#[cfg(not(feature = "std"))] impl core::error::Error for LevenshteinError {} +#[cfg(feature = "std")] +impl std::error::Error for LevenshteinError {} + /// A Unicode aware Levenshtein automaton for running efficient fuzzy queries. /// /// This is only defined when the `levenshtein` crate feature is enabled. diff --git a/src/automaton/mod.rs b/src/automaton/mod.rs index fe503ed6..afe37630 100644 --- a/src/automaton/mod.rs +++ b/src/automaton/mod.rs @@ -144,10 +144,10 @@ impl<'a, T: Automaton> Automaton for &'a T { /// starting with a given prefix. /// /// ```rust -/// extern crate fst; +/// extern crate fst_no_std; /// -/// use fst::{Automaton, IntoStreamer, Streamer, Set}; -/// use fst::automaton::Str; +/// use fst_no_std::{Automaton, IntoStreamer, Streamer, Set}; +/// use fst_no_std::automaton::Str; /// /// # fn main() { example().unwrap(); } /// fn example() -> Result<(), Box> { @@ -216,10 +216,10 @@ impl<'a> Automaton for Str<'a> { /// It can be used to build a simple fuzzy-finder. 
/// /// ```rust -/// extern crate fst; +/// extern crate fst_no_std; /// -/// use fst::{IntoStreamer, Streamer, Set}; -/// use fst::automaton::Subsequence; +/// use fst_no_std::{IntoStreamer, Streamer, Set}; +/// use fst_no_std::automaton::Subsequence; /// /// # fn main() { example().unwrap(); } /// fn example() -> Result<(), Box> { diff --git a/src/bytes.rs b/src/bytes.rs index 943feaa8..bda32465 100644 --- a/src/bytes.rs +++ b/src/bytes.rs @@ -19,6 +19,7 @@ pub fn read_u64_le(slice: &[u8]) -> u64 { /// Write a u32 in little endian format to the beginning of the given slice. /// This panics if the slice has length less than 4. #[inline] +#[cfg(feature = "std")] pub fn write_u32_le(n: u32, slice: &mut [u8]) { assert!(slice.len() >= 4); let bytes = n.to_le_bytes(); @@ -41,6 +42,7 @@ pub fn io_write_u32_le(n: u32, mut wtr: W) -> io::Result<()> { /// Write a u64 in little endian format to the beginning of the given slice. /// This panics if the slice has length less than 8. #[inline] +#[cfg(feature = "std")] pub fn write_u64_le(n: u64, slice: &mut [u8]) { assert!(slice.len() >= 8); let bytes = n.to_le_bytes(); @@ -113,6 +115,7 @@ pub fn unpack_uint(slice: &[u8], nbytes: u8) -> u64 { /// pack_size returns the smallest number of bytes that can encode `n`. #[inline] +#[cfg(feature = "std")] pub fn pack_size(n: u64) -> u8 { if n < 1 << 8 { 1 diff --git a/src/lib.rs b/src/lib.rs index 684f4670..d1c1dc21 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -97,11 +97,11 @@ crate to make the file available as a `&[u8]` without necessarily reading it all into memory (the operating system will automatically handle that for you). ```rust,no_run -# fn example() -> Result<(), fst::Error> { +# fn example() -> Result<(), fst_no_std::Error> { use std::fs::File; use std::io; -use fst::{IntoStreamer, Streamer, Map, MapBuilder}; +use fst_no_std::{IntoStreamer, Streamer, Map, MapBuilder}; use memmap2::Mmap; // This is where we'll write our map to. 
@@ -182,9 +182,9 @@ The example below shows how to find all keys that start with `B` or `G`. The example below uses sets, but the same operations are available on maps too. ```rust -use fst::automaton::{Automaton, Str}; -use fst::set; -use fst::{IntoStreamer, Set, Streamer}; +use fst_no_std::automaton::{Automaton, Str}; +use fst_no_std::set; +use fst_no_std::{IntoStreamer, Set, Streamer}; # fn main() { example().unwrap(); } fn example() -> Result<(), Box> { @@ -312,10 +312,10 @@ doc_comment::doctest!("../README.md"); pub use crate::automaton::Automaton; pub use crate::error::{Error, Result}; pub use crate::map::Map; -#[cfg(feature = "alloc")] +#[cfg(feature = "std")] pub use crate::map::MapBuilder; pub use crate::set::Set; -#[cfg(feature = "alloc")] +#[cfg(feature = "std")] pub use crate::set::SetBuilder; pub use crate::stream::{IntoStreamer, Streamer}; diff --git a/src/map.rs b/src/map.rs index be735ba8..59c2df1d 100644 --- a/src/map.rs +++ b/src/map.rs @@ -1,9 +1,11 @@ #[cfg(feature = "alloc")] use core::fmt; #[cfg(feature = "std")] -use std::io; +use core::iter; #[cfg(feature = "alloc")] -use core::iter::{self, FromIterator}; +use core::iter::FromIterator; +#[cfg(feature = "std")] +use std::io; #[cfg(feature = "alloc")] use crate::automaton::{AlwaysMatch, Automaton}; @@ -14,7 +16,7 @@ use crate::stream::IntoStreamer; use crate::stream::Streamer; use crate::Result; #[cfg(feature = "alloc")] -use alloc::{vec::Vec, string::String}; +use alloc::{string::String, vec::Vec}; /// Map is a lexicographically ordered map from byte strings to integers. /// @@ -96,7 +98,7 @@ impl> Map { /// # Example /// /// ```no_run - /// use fst::Map; + /// use fst_no_std::Map; /// /// // File written from a build script using MapBuilder. /// # const IGNORE: &str = stringify! 
{ @@ -115,7 +117,7 @@ impl> Map { /// # Example /// /// ```rust - /// use fst::Map; + /// use fst_no_std::Map; /// /// let map = Map::from_iter(vec![("a", 1), ("b", 2), ("c", 3)]).unwrap(); /// @@ -133,7 +135,7 @@ impl> Map { /// # Example /// /// ```rust - /// use fst::Map; + /// use fst_no_std::Map; /// /// let map = Map::from_iter(vec![("a", 1), ("b", 2), ("c", 3)]).unwrap(); /// @@ -161,7 +163,7 @@ impl> Map { /// used. `while let` is useful instead: /// /// ```rust - /// use fst::{IntoStreamer, Streamer, Map}; + /// use fst_no_std::{IntoStreamer, Streamer, Map}; /// /// let map = Map::from_iter(vec![("a", 1), ("b", 2), ("c", 3)]).unwrap(); /// let mut stream = map.stream(); @@ -189,7 +191,7 @@ impl> Map { /// # Example /// /// ```rust - /// use fst::{IntoStreamer, Streamer, Map}; + /// use fst_no_std::{IntoStreamer, Streamer, Map}; /// /// let map = Map::from_iter(vec![("a", 1), ("b", 2), ("c", 3)]).unwrap(); /// let mut stream = map.keys(); @@ -214,7 +216,7 @@ impl> Map { /// # Example /// /// ```rust - /// use fst::{IntoStreamer, Streamer, Map}; + /// use fst_no_std::{IntoStreamer, Streamer, Map}; /// /// let map = Map::from_iter(vec![("a", 1), ("b", 2), ("c", 3)]).unwrap(); /// let mut stream = map.values(); @@ -245,7 +247,7 @@ impl> Map { /// Returns only the key-value pairs in the range given. 
/// /// ```rust - /// use fst::{IntoStreamer, Streamer, Map}; + /// use fst_no_std::{IntoStreamer, Streamer, Map}; /// /// let map = Map::from_iter(vec![ /// ("a", 1), ("b", 2), ("c", 3), ("d", 4), ("e", 5), @@ -287,8 +289,8 @@ impl> Map { /// to search maps: /// /// ```rust - /// use fst::automaton::Subsequence; - /// use fst::{IntoStreamer, Streamer, Map}; + /// use fst_no_std::automaton::Subsequence; + /// use fst_no_std::{IntoStreamer, Streamer, Map}; /// /// # fn main() { example().unwrap(); } /// fn example() -> Result<(), Box> { @@ -338,8 +340,8 @@ An implementation of fuzzy search using Levenshtein automata can be used to search maps: ```rust -use fst::automaton::Levenshtein; -use fst::{IntoStreamer, Streamer, Map}; +use fst_no_std::automaton::Levenshtein; +use fst_no_std::{IntoStreamer, Streamer, Map}; # fn main() { example().unwrap(); } fn example() -> Result<(), Box> { @@ -405,8 +407,8 @@ fn example() -> Result<(), Box> { /// that same key in the all of the streams. /// /// ```rust - /// use fst::{Streamer, Map}; - /// use fst::map::IndexedValue; + /// use fst_no_std::{Streamer, Map}; + /// use fst_no_std::map::IndexedValue; /// /// let map1 = Map::from_iter(vec![ /// ("a", 1), ("b", 2), ("c", 3), @@ -461,7 +463,7 @@ fn example() -> Result<(), Box> { /// ``` /// use std::borrow::Cow; /// - /// use fst::Map; + /// use fst_no_std::Map; /// /// let map: Map> = Map::from_iter( /// [("hello", 12), ("world", 42)].iter().cloned(), @@ -566,7 +568,7 @@ impl<'m, 'a, D: AsRef<[u8]>> IntoStreamer<'a> for &'m Map { /// goal without needing to explicitly use `MapBuilder`. 
/// /// ```rust -/// use fst::{IntoStreamer, Streamer, Map, MapBuilder}; +/// use fst_no_std::{IntoStreamer, Streamer, Map, MapBuilder}; /// /// let mut build = MapBuilder::memory(); /// build.insert("bruce", 1).unwrap(); @@ -599,7 +601,7 @@ impl<'m, 'a, D: AsRef<[u8]>> IntoStreamer<'a> for &'m Map { /// use std::fs::File; /// use std::io; /// -/// use fst::{IntoStreamer, Streamer, Map, MapBuilder}; +/// use fst_no_std::{IntoStreamer, Streamer, Map, MapBuilder}; /// /// let mut wtr = io::BufWriter::new(File::create("map.fst").unwrap()); /// let mut build = MapBuilder::new(wtr).unwrap(); @@ -626,7 +628,7 @@ impl<'m, 'a, D: AsRef<[u8]>> IntoStreamer<'a> for &'m Map { /// (b"stevie".to_vec(), 3), /// ]); /// ``` -#[cfg(feature = "alloc")] +#[cfg(feature = "std")] pub struct MapBuilder(raw::Builder); #[cfg(feature = "std")] @@ -1033,8 +1035,8 @@ impl<'m> OpBuilder<'m> { /// # Example /// /// ```rust - /// use fst::{IntoStreamer, Streamer, Map}; - /// use fst::map::IndexedValue; + /// use fst_no_std::{IntoStreamer, Streamer, Map}; + /// use fst_no_std::map::IndexedValue; /// /// let map1 = Map::from_iter(vec![ /// ("a", 1), ("b", 2), ("c", 3), @@ -1078,8 +1080,8 @@ impl<'m> OpBuilder<'m> { /// # Example /// /// ```rust - /// use fst::{IntoStreamer, Streamer, Map}; - /// use fst::map::IndexedValue; + /// use fst_no_std::{IntoStreamer, Streamer, Map}; + /// use fst_no_std::map::IndexedValue; /// /// let map1 = Map::from_iter(vec![ /// ("a", 1), ("b", 2), ("c", 3), @@ -1125,8 +1127,8 @@ impl<'m> OpBuilder<'m> { /// # Example /// /// ```rust - /// use fst::{Streamer, Map}; - /// use fst::map::IndexedValue; + /// use fst_no_std::{Streamer, Map}; + /// use fst_no_std::map::IndexedValue; /// /// let map1 = Map::from_iter(vec![ /// ("a", 1), ("b", 2), ("c", 3), @@ -1171,8 +1173,8 @@ impl<'m> OpBuilder<'m> { /// # Example /// /// ```rust - /// use fst::{IntoStreamer, Streamer, Map}; - /// use fst::map::IndexedValue; + /// use fst_no_std::{IntoStreamer, Streamer, Map}; + /// use 
fst_no_std::map::IndexedValue; /// /// let map1 = Map::from_iter(vec![ /// ("a", 1), ("b", 2), ("c", 3), diff --git a/src/raw/build.rs b/src/raw/build.rs index b74448ca..3bee7a39 100644 --- a/src/raw/build.rs +++ b/src/raw/build.rs @@ -1,29 +1,15 @@ -#[cfg(feature = "std")] -use std::io; -#[cfg(feature = "std")] use crate::bytes; -#[cfg(feature = "std")] use crate::error::Result; -#[cfg(feature = "alloc")] use crate::raw::counting_writer::CountingWriter; -#[cfg(feature = "std")] use crate::raw::error::Error; -#[cfg(feature = "alloc")] use crate::raw::registry::Registry; -#[cfg(feature = "std")] use crate::raw::registry::RegistryEntry; use crate::raw::Output; -#[cfg(feature = "alloc")] -use crate::raw::{CompiledAddr, Fst, Transition}; -#[cfg(feature = "std")] -use crate::raw::{ - FstType, EMPTY_ADDRESS, - NONE_ADDRESS, VERSION, -}; -#[cfg(feature = "std")] +use crate::raw::{CompiledAddr, Transition}; +use crate::raw::{Fst, FstType, EMPTY_ADDRESS, NONE_ADDRESS, VERSION}; use crate::stream::{IntoStreamer, Streamer}; -#[cfg(feature = "alloc")] -use alloc::{vec::Vec, vec}; +use alloc::{vec, vec::Vec}; +use std::io; /// A builder for creating a finite state transducer. /// @@ -53,7 +39,7 @@ use alloc::{vec::Vec, vec}; /// /// The algorithmic complexity of fst construction is `O(n)` where `n` is the /// number of elements added to the fst. -#[cfg(feature = "alloc")] +#[cfg(feature = "std")] pub struct Builder { /// The FST raw data is written directly to `wtr`. 
/// @@ -344,7 +330,7 @@ impl Builder { } } -#[cfg(feature = "alloc")] +#[cfg(feature = "std")] impl UnfinishedNodes { fn new() -> UnfinishedNodes { let mut unfinished = UnfinishedNodes { stack: Vec::with_capacity(64) }; @@ -442,7 +428,7 @@ impl UnfinishedNodes { } } -#[cfg(feature = "alloc")] +#[cfg(feature = "std")] impl BuilderNodeUnfinished { fn last_compiled(&mut self, addr: CompiledAddr) { if let Some(trans) = self.last.take() { diff --git a/src/raw/common_inputs.rs b/src/raw/common_inputs.rs index f7d7d23b..864801d5 100644 --- a/src/raw/common_inputs.rs +++ b/src/raw/common_inputs.rs @@ -1,3 +1,4 @@ +#[cfg(feature = "std")] pub const COMMON_INPUTS: [u8; 256] = [ 84, // '\x00' 85, // '\x01' diff --git a/src/raw/counting_writer.rs b/src/raw/counting_writer.rs index 1fce9bbd..70d9315f 100644 --- a/src/raw/counting_writer.rs +++ b/src/raw/counting_writer.rs @@ -1,4 +1,3 @@ -#[cfg(feature = "std")] use std::io; use crate::raw::crc32::CheckSummer; @@ -10,7 +9,6 @@ pub struct CountingWriter { summer: CheckSummer, } -#[cfg(feature = "std")] impl CountingWriter { /// Wrap the given writer with a counter. 
pub fn new(wtr: W) -> CountingWriter { @@ -45,7 +43,6 @@ impl CountingWriter { } } -#[cfg(feature = "std")] impl io::Write for CountingWriter { fn write(&mut self, buf: &[u8]) -> io::Result { self.summer.update(buf); diff --git a/src/raw/error.rs b/src/raw/error.rs index 35c0effa..5db7b7d2 100644 --- a/src/raw/error.rs +++ b/src/raw/error.rs @@ -1,9 +1,10 @@ -use core::fmt; -use core::str; #[cfg(feature = "alloc")] use alloc::string::FromUtf8Error; #[cfg(feature = "alloc")] -use alloc::{vec::Vec, string::String, borrow::ToOwned, format}; +use alloc::{borrow::ToOwned, format, string::String, vec::Vec}; +use core::fmt; +#[cfg(feature = "alloc")] +use core::str; use crate::raw::FstType; diff --git a/src/raw/mod.rs b/src/raw/mod.rs index 918f0f31..94dde75b 100644 --- a/src/raw/mod.rs +++ b/src/raw/mod.rs @@ -21,23 +21,29 @@ Most of the rest of the types are streams from set operations. use core::cmp; use core::fmt; +#[cfg(feature = "alloc")] use crate::automaton::{AlwaysMatch, Automaton}; use crate::bytes; use crate::error::Result; +#[cfg(feature = "alloc")] use crate::stream::{IntoStreamer, Streamer}; -#[cfg(feature = "alloc")] +#[cfg(feature = "std")] pub use crate::raw::build::Builder; pub use crate::raw::error::Error; pub use crate::raw::node::{Node, Transitions}; pub use crate::raw::ops::IndexedValue; #[cfg(feature = "alloc")] -pub use crate::raw::ops::{Difference, Intersection, OpBuilder, SymmetricDifference, Union}; +pub use crate::raw::ops::{ + Difference, Intersection, OpBuilder, SymmetricDifference, Union, +}; #[cfg(feature = "alloc")] -use alloc::{vec::Vec, vec, borrow::ToOwned, string::String}; +use alloc::{borrow::ToOwned, string::String, vec, vec::Vec}; +#[cfg(feature = "std")] mod build; mod common_inputs; +#[cfg(feature = "std")] mod counting_writer; mod crc32; mod crc32_table; @@ -797,6 +803,7 @@ impl<'f> FstRef<'f> { } #[inline] + #[cfg(feature = "alloc")] fn empty_final_output(&self) -> Option { let root = self.root(); if root.is_final() { @@ -1111,6 
+1118,7 @@ where } #[derive(Clone, Debug)] +#[cfg(feature = "alloc")] struct StreamState<'f, S> { node: Node<'f>, trans: usize, diff --git a/src/raw/node.rs b/src/raw/node.rs index 455996f6..f475f7a7 100644 --- a/src/raw/node.rs +++ b/src/raw/node.rs @@ -1,14 +1,16 @@ #[cfg(feature = "std")] use core::cmp; use core::fmt; +use core::ops::Range; #[cfg(feature = "std")] use std::io; -use core::ops::Range; use crate::bytes; #[cfg(feature = "std")] use crate::raw::build::BuilderNode; -use crate::raw::common_inputs::{COMMON_INPUTS, COMMON_INPUTS_INV}; +#[cfg(feature = "std")] +use crate::raw::common_inputs::COMMON_INPUTS; +use crate::raw::common_inputs::COMMON_INPUTS_INV; use crate::raw::{ u64_to_usize, CompiledAddr, Output, Transition, EMPTY_ADDRESS, }; @@ -330,11 +332,13 @@ impl StateOneTransNext { } #[inline] + #[cfg(feature = "std")] fn new() -> StateOneTransNext { StateOneTransNext(0b11_000000) } #[inline] + #[cfg(feature = "std")] fn set_common_input(&mut self, input: u8) { self.0 = (self.0 & 0b11_000000) | common_idx(input, 0b111111); } @@ -400,11 +404,13 @@ impl StateOneTrans { } #[inline] + #[cfg(feature = "std")] fn new() -> StateOneTrans { StateOneTrans(0b10_000000) } #[inline] + #[cfg(feature = "std")] fn set_common_input(&mut self, input: u8) { self.0 = (self.0 & 0b10_000000) | common_idx(input, 0b111111); } @@ -548,11 +554,13 @@ impl StateAnyTrans { } #[inline] + #[cfg(feature = "std")] fn new() -> StateAnyTrans { StateAnyTrans(0b00_000000) } #[inline] + #[cfg(feature = "std")] fn set_final_state(&mut self, yes: bool) { if yes { self.0 |= 0b01_000000; @@ -565,6 +573,7 @@ impl StateAnyTrans { } #[inline] + #[cfg(feature = "std")] fn set_state_ntrans(&mut self, n: u8) { if n <= 0b00_111111 { self.0 = (self.0 & 0b11_000000) | n; @@ -755,11 +764,13 @@ impl PackSizes { } #[inline] + #[cfg(feature = "std")] fn encode(&self) -> u8 { self.0 } #[inline] + #[cfg(feature = "std")] fn set_transition_pack_size(&mut self, size: u8) { assert!(size <= 8); self.0 = (self.0 
& 0b0000_1111) | (size << 4); @@ -771,6 +782,7 @@ impl PackSizes { } #[inline] + #[cfg(feature = "std")] fn set_output_pack_size(&mut self, size: u8) { assert!(size <= 8); self.0 = (self.0 & 0b1111_0000) | size; @@ -811,6 +823,7 @@ impl<'f, 'n> Iterator for Transitions<'f, 'n> { /// Nevertheless, the *caller* may have a priori knowledge that could be /// supplied to the builder manually, which could then be embedded in the FST. #[inline] +#[cfg(feature = "std")] fn common_idx(input: u8, max: u8) -> u8 { let val = ((COMMON_INPUTS[input as usize] as u32 + 1) % 256) as u8; if val > max { @@ -860,6 +873,7 @@ fn pack_delta_in( } #[inline] +#[cfg(feature = "std")] fn pack_delta_size(node_addr: CompiledAddr, trans_addr: CompiledAddr) -> u8 { let delta_addr = if trans_addr == EMPTY_ADDRESS { EMPTY_ADDRESS diff --git a/src/raw/ops.rs b/src/raw/ops.rs index a8215321..6f0a09dd 100644 --- a/src/raw/ops.rs +++ b/src/raw/ops.rs @@ -1,13 +1,16 @@ -use core::cmp; #[cfg(feature = "alloc")] -use alloc::{collections::BinaryHeap, boxed::Box}; +use crate::raw::Output; +#[cfg(feature = "alloc")] +use crate::stream::{IntoStreamer, Streamer}; +#[cfg(feature = "alloc")] +use alloc::{boxed::Box, collections::BinaryHeap}; #[cfg(feature = "alloc")] use alloc::{vec, vec::Vec}; +#[cfg(feature = "alloc")] +use core::cmp; +#[cfg(feature = "alloc")] use core::iter::FromIterator; -use crate::raw::Output; -use crate::stream::{IntoStreamer, Streamer}; - /// Permits stream operations to be hetergeneous with respect to streams. 
#[cfg(feature = "alloc")] type BoxedStream<'f> = diff --git a/src/raw/registry.rs b/src/raw/registry.rs index bc027503..8b314d4f 100644 --- a/src/raw/registry.rs +++ b/src/raw/registry.rs @@ -1,10 +1,11 @@ -#[cfg(feature = "alloc")] +#[cfg(feature = "std")] use crate::raw::build::BuilderNode; +#[cfg(feature = "std")] use crate::raw::{CompiledAddr, NONE_ADDRESS}; -#[cfg(feature = "alloc")] -use alloc::{vec::Vec, vec}; +#[cfg(feature = "std")] +use alloc::vec::Vec; #[derive(Debug)] -#[cfg(feature = "alloc")] +#[cfg(feature = "std")] pub struct Registry { table: Vec, table_size: usize, // number of rows @@ -12,27 +13,27 @@ pub struct Registry { } #[derive(Debug)] -#[cfg(feature = "alloc")] +#[cfg(feature = "std")] struct RegistryCache<'a> { cells: &'a mut [RegistryCell], } #[derive(Clone, Debug)] -#[cfg(feature = "alloc")] +#[cfg(feature = "std")] pub struct RegistryCell { addr: CompiledAddr, node: BuilderNode, } #[derive(Debug)] -#[cfg(feature = "alloc")] +#[cfg(feature = "std")] pub enum RegistryEntry<'a> { Found(CompiledAddr), NotFound(&'a mut RegistryCell), Rejected, } -#[cfg(feature = "alloc")] +#[cfg(feature = "std")] impl Registry { pub fn new(table_size: usize, mru_size: usize) -> Registry { let empty_cell = RegistryCell::none(); @@ -69,7 +70,7 @@ impl Registry { } } -#[cfg(feature = "alloc")] +#[cfg(feature = "std")] impl<'a> RegistryCache<'a> { fn entry(mut self, node: &BuilderNode) -> RegistryEntry<'a> { if self.cells.len() == 1 { @@ -120,7 +121,7 @@ impl<'a> RegistryCache<'a> { } } -#[cfg(feature = "alloc")] +#[cfg(feature = "std")] impl RegistryCell { fn none() -> RegistryCell { RegistryCell { addr: NONE_ADDRESS, node: BuilderNode::default() } diff --git a/src/set.rs b/src/set.rs index 5ed31cfe..a8add849 100644 --- a/src/set.rs +++ b/src/set.rs @@ -1,15 +1,20 @@ #[cfg(feature = "alloc")] -use core::fmt; -#[cfg(feature = "std")] -use std::io; -use core::iter::{self, FromIterator}; -#[cfg(feature = "alloc")] -use alloc::{vec::Vec, string::String}; - use 
crate::automaton::{AlwaysMatch, Automaton}; use crate::raw; -use crate::stream::{IntoStreamer, Streamer}; +#[cfg(feature = "alloc")] +use crate::stream::IntoStreamer; +use crate::stream::Streamer; use crate::Result; +#[cfg(feature = "alloc")] +use alloc::{string::String, vec::Vec}; +#[cfg(feature = "alloc")] +use core::fmt; +#[cfg(feature = "std")] +use core::iter; +#[cfg(feature = "alloc")] +use core::iter::FromIterator; +#[cfg(feature = "std")] +use std::io; /// Set is a lexicographically ordered set of byte strings. /// @@ -67,7 +72,7 @@ impl> Set { /// # Example /// /// ```no_run - /// use fst::Set; + /// use fst_no_std::Set; /// /// // File written from a build script using SetBuilder. /// # const IGNORE: &str = stringify! { @@ -86,7 +91,7 @@ impl> Set { /// # Example /// /// ```rust - /// use fst::Set; + /// use fst_no_std::Set; /// /// let set = Set::from_iter(&["a", "b", "c"]).unwrap(); /// @@ -113,7 +118,7 @@ impl> Set { /// used. `while let` is useful instead: /// /// ```rust - /// use fst::{IntoStreamer, Streamer, Set}; + /// use fst_no_std::{IntoStreamer, Streamer, Set}; /// /// let set = Set::from_iter(&["a", "b", "c"]).unwrap(); /// let mut stream = set.stream(); @@ -144,7 +149,7 @@ impl> Set { /// Returns only the keys in the range given. 
/// /// ```rust - /// use fst::{IntoStreamer, Streamer, Set}; + /// use fst_no_std::{IntoStreamer, Streamer, Set}; /// /// let set = Set::from_iter(&["a", "b", "c", "d", "e"]).unwrap(); /// let mut stream = set.range().ge("b").lt("e").into_stream(); @@ -174,8 +179,8 @@ impl> Set { /// to search sets: /// /// ```rust - /// use fst::automaton::Subsequence; - /// use fst::{IntoStreamer, Streamer, Set}; + /// use fst_no_std::automaton::Subsequence; + /// use fst_no_std::{IntoStreamer, Streamer, Set}; /// /// # fn main() { example().unwrap(); } /// fn example() -> Result<(), Box> { @@ -220,8 +225,8 @@ An implementation of fuzzy search using Levenshtein automata can be used to search sets: ```rust -use fst::automaton::Levenshtein; -use fst::{IntoStreamer, Streamer, Set}; +use fst_no_std::automaton::Levenshtein; +use fst_no_std::{IntoStreamer, Streamer, Set}; # fn main() { example().unwrap(); } fn example() -> Result<(), Box> { @@ -280,7 +285,7 @@ fn example() -> Result<(), Box> { /// # Example /// /// ```rust - /// use fst::{IntoStreamer, Streamer, Set}; + /// use fst_no_std::{IntoStreamer, Streamer, Set}; /// /// let set1 = Set::from_iter(&["a", "b", "c"]).unwrap(); /// let set2 = Set::from_iter(&["a", "y", "z"]).unwrap(); @@ -307,7 +312,7 @@ fn example() -> Result<(), Box> { /// # Example /// /// ```rust - /// use fst::{IntoStreamer, Streamer, Set}; + /// use fst_no_std::{IntoStreamer, Streamer, Set}; /// /// let set1 = Set::from_iter(&["a", "b", "c"]).unwrap(); /// let set2 = Set::from_iter(&["x", "y", "z"]).unwrap(); @@ -334,7 +339,7 @@ fn example() -> Result<(), Box> { /// # Example /// /// ```rust - /// use fst::Set; + /// use fst_no_std::Set; /// /// let set1 = Set::from_iter(&["a", "b", "c"]).unwrap(); /// let set2 = Set::from_iter(&["x", "y", "z"]).unwrap(); @@ -362,7 +367,7 @@ fn example() -> Result<(), Box> { /// # Example /// /// ```rust - /// use fst::Set; + /// use fst_no_std::Set; /// /// let set1 = Set::from_iter(&["a", "b", "c"]).unwrap(); /// let set2 = 
Set::from_iter(&["x", "y", "z"]).unwrap(); @@ -406,7 +411,7 @@ fn example() -> Result<(), Box> { /// ``` /// use std::borrow::Cow; /// - /// use fst::Set; + /// use fst_no_std::Set; /// /// let set: Set> = Set::from_iter( /// &["hello", "world"], @@ -509,7 +514,7 @@ impl> From> for Set { /// goal without needing to explicitly use `SetBuilder`. /// /// ```rust -/// use fst::{IntoStreamer, Streamer, Set, SetBuilder}; +/// use fst_no_std::{IntoStreamer, Streamer, Set, SetBuilder}; /// /// let mut build = SetBuilder::memory(); /// build.insert("bruce").unwrap(); @@ -540,7 +545,7 @@ impl> From> for Set { /// use std::fs::File; /// use std::io; /// -/// use fst::{IntoStreamer, Streamer, Set, SetBuilder}; +/// use fst_no_std::{IntoStreamer, Streamer, Set, SetBuilder}; /// /// let mut wtr = io::BufWriter::new(File::create("set.fst").unwrap()); /// let mut build = SetBuilder::new(wtr).unwrap(); @@ -565,7 +570,7 @@ impl> From> for Set { /// "bruce".as_bytes(), "clarence".as_bytes(), "stevie".as_bytes(), /// ]); /// ``` -#[cfg(feature = "alloc")] +#[cfg(feature = "std")] pub struct SetBuilder(raw::Builder); #[cfg(feature = "std")] @@ -894,7 +899,7 @@ impl<'s> OpBuilder<'s> { /// # Example /// /// ```rust - /// use fst::{IntoStreamer, Streamer, Set}; + /// use fst_no_std::{IntoStreamer, Streamer, Set}; /// /// let set1 = Set::from_iter(&["a", "b", "c"]).unwrap(); /// let set2 = Set::from_iter(&["a", "y", "z"]).unwrap(); @@ -917,7 +922,7 @@ impl<'s> OpBuilder<'s> { /// # Example /// /// ```rust - /// use fst::{IntoStreamer, Streamer, Set}; + /// use fst_no_std::{IntoStreamer, Streamer, Set}; /// /// let set1 = Set::from_iter(&["a", "b", "c"]).unwrap(); /// let set2 = Set::from_iter(&["a", "y", "z"]).unwrap(); @@ -942,7 +947,7 @@ impl<'s> OpBuilder<'s> { /// # Example /// /// ```rust - /// use fst::{IntoStreamer, Streamer, Set}; + /// use fst_no_std::{IntoStreamer, Streamer, Set}; /// /// let set1 = Set::from_iter(&["a", "b", "c"]).unwrap(); /// let set2 = Set::from_iter(&["a", 
"y", "z"]).unwrap(); @@ -972,7 +977,7 @@ impl<'s> OpBuilder<'s> { /// # Example /// /// ```rust - /// use fst::{IntoStreamer, Streamer, Set}; + /// use fst_no_std::{IntoStreamer, Streamer, Set}; /// /// let set1 = Set::from_iter(&["a", "b", "c"]).unwrap(); /// let set2 = Set::from_iter(&["a", "y", "z"]).unwrap(); diff --git a/tests/test.rs b/tests/test.rs index c7d733c7..2e05b4d8 100644 --- a/tests/test.rs +++ b/tests/test.rs @@ -1,10 +1,10 @@ #[cfg(feature = "levenshtein")] -use fst::automaton::Levenshtein; -use fst::automaton::{Str, Subsequence}; +use fst_no_std::automaton::Levenshtein; +use fst_no_std::automaton::{Str, Subsequence}; #[cfg(feature = "levenshtein")] -use fst::raw::{Builder, Fst}; -use fst::set::Set; -use fst::{self, Automaton, IntoStreamer, Streamer}; +use fst_no_std::raw::{Builder, Fst}; +use fst_no_std::set::Set; +use fst_no_std::{self, Automaton, IntoStreamer, Streamer}; static WORDS: &'static str = include_str!("../data/words-10000"); @@ -110,7 +110,7 @@ fn union_small() { #[cfg(feature = "levenshtein")] #[test] fn intersection_large() { - use fst::set::OpBuilder; + use fst_no_std::set::OpBuilder; let set = get_set(); let lev = Levenshtein::new("foo", 3).unwrap(); @@ -129,7 +129,7 @@ fn intersection_large() { #[cfg(feature = "levenshtein")] #[test] fn union_large() { - use fst::set::OpBuilder; + use fst_no_std::set::OpBuilder; let set = get_set(); let lev = Levenshtein::new("foo", 3).unwrap(); @@ -177,9 +177,9 @@ fn subsequence() { #[test] fn implements_default() { - let map: fst::Map> = Default::default(); + let map: fst_no_std::Map> = Default::default(); assert!(map.is_empty()); - let set: fst::Set> = Default::default(); + let set: fst_no_std::Set> = Default::default(); assert!(set.is_empty()); } From 912bf04d5fa33330691581d21fecd82ee91f643c Mon Sep 17 00:00:00 2001 From: Jonas Kruckenberg <118265418+CrabNejonas@users.noreply.github.com> Date: Wed, 17 Jan 2024 19:08:44 +0100 Subject: [PATCH 18/28] clippy --- build.rs | 17 +++++++++-------- 
src/raw/error.rs | 9 +-------- src/raw/mod.rs | 1 + src/stream.rs | 2 +- 4 files changed, 12 insertions(+), 17 deletions(-) diff --git a/build.rs b/build.rs index 283164f2..7363a9e3 100644 --- a/build.rs +++ b/build.rs @@ -3,7 +3,7 @@ use std::fs::File; use std::io::{self, Write}; use std::path::{Path, PathBuf}; -const CASTAGNOLI_POLY: u32 = 0x82f63b78; +const CASTAGNOLI_POLY: u32 = 0x82f6_3b78; type Result = std::result::Result>; @@ -38,8 +38,8 @@ fn write_tag_lookup_table(out_dir: &Path) -> Result<()> { } fn tag_entry(b: u8) -> u16 { - let b = b as u16; - match b & 0b00000011 { + let b = u16::from(b); + match b & 0b0000_0011 { 0b00 => { let lit_len = (b >> 2) + 1; if lit_len <= 60 { @@ -74,16 +74,16 @@ fn write_crc_tables(out_dir: &Path) -> Result<()> { let table16 = make_table16(CASTAGNOLI_POLY); writeln!(out, "pub const TABLE: [u32; 256] = [")?; - for &x in table.iter() { - writeln!(out, " {},", x)?; + for &x in &table { + writeln!(out, " {x},")?; } writeln!(out, "];\n")?; writeln!(out, "pub const TABLE16: [[u32; 256]; 16] = [")?; - for table in table16.iter() { + for table in &table16 { writeln!(out, " [")?; - for &x in table.iter() { - writeln!(out, " {},", x)?; + for &x in table { + writeln!(out, " {x},")?; } writeln!(out, " ],")?; } @@ -94,6 +94,7 @@ fn write_crc_tables(out_dir: &Path) -> Result<()> { Ok(()) } +#[allow(clippy::cast_possible_truncation)] fn make_table16(poly: u32) -> [[u32; 256]; 16] { let mut tab = [[0; 256]; 16]; tab[0] = make_table(poly); diff --git a/src/raw/error.rs b/src/raw/error.rs index 5db7b7d2..195f8ba8 100644 --- a/src/raw/error.rs +++ b/src/raw/error.rs @@ -12,6 +12,7 @@ use crate::raw::FstType; /// /// This enum is non-exhaustive. New variants may be added to it in /// compatible releases. +#[non_exhaustive] pub enum Error { /// A version mismatch occurred while reading a finite state transducer. /// @@ -80,13 +81,6 @@ pub enum Error { /// An error that occurred when trying to decode a UTF-8 byte key. 
#[cfg(feature = "alloc")] FromUtf8(FromUtf8Error), - /// Hints that destructuring should not be exhaustive. - /// - /// This enum may grow additional variants, so this makes sure clients - /// don't count on exhaustive matching. (Otherwise, adding a new variant - /// could break existing code.) - #[doc(hidden)] - __Nonexhaustive, } impl fmt::Display for Error { @@ -141,7 +135,6 @@ inserted in lexicographic order.", Error opening FST: expected type '{}', got type '{}'.", expected, got ), - Error::__Nonexhaustive => unreachable!(), } } } diff --git a/src/raw/mod.rs b/src/raw/mod.rs index 94dde75b..6f512bb8 100644 --- a/src/raw/mod.rs +++ b/src/raw/mod.rs @@ -1407,6 +1407,7 @@ impl fmt::Debug for Transition { #[inline] #[cfg(target_pointer_width = "64")] +#[allow(clippy::cast_possible_truncation)] fn u64_to_usize(n: u64) -> usize { n as usize } diff --git a/src/stream.rs b/src/stream.rs index e78a5965..28153576 100644 --- a/src/stream.rs +++ b/src/stream.rs @@ -106,7 +106,7 @@ pub trait Streamer<'a> { fn next(&'a mut self) -> Option; } -/// IntoStreamer describes types that can be converted to streams. +/// `IntoStreamer` describes types that can be converted to streams. /// /// This is analogous to the `IntoIterator` trait for `Iterator` in /// `std::iter`. 
From bdc90af82e6cdafa69370eecd2e977569d5695e7 Mon Sep 17 00:00:00 2001 From: Jonas Kruckenberg <118265418+CrabNejonas@users.noreply.github.com> Date: Thu, 18 Jan 2024 09:40:12 +0100 Subject: [PATCH 19/28] clippy --- .github/workflows/ci.yml | 2 +- src/automaton/levenshtein.rs | 48 +++++++++++------------ src/automaton/mod.rs | 20 +++++----- src/bytes.rs | 24 ++++++------ src/lib.rs | 7 ++-- src/map.rs | 22 +++++++---- src/raw/build.rs | 13 +++--- src/raw/crc32.rs | 2 +- src/raw/error.rs | 24 +++++------- src/raw/mod.rs | 40 ++++++++++--------- src/raw/node.rs | 76 +++++++++++++++--------------------- src/raw/ops.rs | 18 +++++++-- src/raw/registry.rs | 12 +++--- src/raw/tests.rs | 10 ++--- src/set.rs | 20 ++++++---- tests/test.rs | 4 +- 16 files changed, 175 insertions(+), 167 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0eb03e23..f287be67 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -79,7 +79,7 @@ jobs: with: components: clippy - uses: Swatinem/rust-cache@v2 - - run: cargo clippy --workspace -- -Dclippy::all -Dclippy::pedantic + - run: cargo clippy --workspace -- -Dclippy::all rustfmt: name: Rustfmt diff --git a/src/automaton/levenshtein.rs b/src/automaton/levenshtein.rs index b30c4324..addc58fa 100644 --- a/src/automaton/levenshtein.rs +++ b/src/automaton/levenshtein.rs @@ -29,8 +29,7 @@ impl fmt::Display for LevenshteinError { LevenshteinError::TooManyStates(size_limit) => write!( f, "Levenshtein automaton exceeds size limit of \ - {} states", - size_limit + {size_limit} states" ), } } @@ -61,27 +60,26 @@ impl std::error::Error for LevenshteinError {} /// from `foo`. 
/// /// ```rust -/// use fst::automaton::Levenshtein; -/// use fst::{IntoStreamer, Streamer, Set}; +/// use fst_no_std::automaton::Levenshtein; +/// use fst_no_std::{IntoStreamer, Streamer, Set}; /// -/// fn main() { -/// let keys = vec!["fa", "fo", "fob", "focus", "foo", "food", "foul"]; -/// let set = Set::from_iter(keys).unwrap(); +/// let keys = vec!["fa", "fo", "fob", "focus", "foo", "food", "foul"]; +/// let set = Set::from_iter(keys).unwrap(); /// -/// let lev = Levenshtein::new("foo", 1).unwrap(); -/// let mut stream = set.search(&lev).into_stream(); +/// let lev = Levenshtein::new("foo", 1).unwrap(); +/// let mut stream = set.search(&lev).into_stream(); /// -/// let mut keys = vec![]; -/// while let Some(key) = stream.next() { -/// keys.push(key.to_vec()); -/// } -/// assert_eq!(keys, vec![ -/// "fo".as_bytes(), // 1 deletion -/// "fob".as_bytes(), // 1 substitution -/// "foo".as_bytes(), // 0 insertions/deletions/substitutions -/// "food".as_bytes(), // 1 insertion -/// ]); +/// let mut keys = vec![]; +/// while let Some(key) = stream.next() { +/// keys.push(key.to_vec()); /// } +/// +/// assert_eq!(keys, vec![ +/// "fo".as_bytes(), // 1 deletion +/// "fob".as_bytes(), // 1 substitution +/// "foo".as_bytes(), // 0 insertions/deletions/substitutions +/// "food".as_bytes(), // 1 insertion +/// ]); /// ``` /// /// This example only uses ASCII characters, but it will work equally well @@ -182,17 +180,17 @@ impl DynamicLevenshtein { } fn is_match(&self, state: &[usize]) -> bool { - state.last().map(|&n| n <= self.dist).unwrap_or(false) + state.last().is_some_and(|&n| n <= self.dist) } fn can_match(&self, state: &[usize]) -> bool { - state.iter().min().map(|&n| n <= self.dist).unwrap_or(false) + state.iter().min().is_some_and(|&n| n <= self.dist) } fn accept(&self, state: &[usize], chr: Option) -> Vec { let mut next = vec![state[0] + 1]; for (i, c) in self.query.chars().enumerate() { - let cost = if Some(c) == chr { 0 } else { 1 }; + let cost = 
usize::from(Some(c) != chr); let v = cmp::min( cmp::min(next[i] + 1, state[i + 1] + 1), state[i] + cost, @@ -245,7 +243,7 @@ impl fmt::Debug for State { writeln!(f, " is_match: {:?}", self.is_match)?; for i in 0..256 { if let Some(si) = self.next[i] { - writeln!(f, " {:?}: {:?}", i, si)?; + writeln!(f, " {i:?}: {si:?}")?; } } write!(f, "}}") @@ -341,7 +339,7 @@ impl DfaBuilder { // Some((si, false)) => si, }; self.add_utf8_sequences(false, from_si, to_si, '\u{0}', '\u{10FFFF}'); - return Some((to_si, mismatch_state)); + Some((to_si, mismatch_state)) } fn add_utf8_sequences( @@ -375,7 +373,7 @@ impl DfaBuilder { to: usize, range: &Utf8Range, ) { - for b in range.start as usize..range.end as usize + 1 { + for b in (range.start as usize)..=(range.end as usize) { if overwrite || self.dfa.states[from].next[b].is_none() { self.dfa.states[from].next[b] = Some(to); } diff --git a/src/automaton/mod.rs b/src/automaton/mod.rs index afe37630..c7aae55e 100644 --- a/src/automaton/mod.rs +++ b/src/automaton/mod.rs @@ -173,6 +173,7 @@ pub struct Str<'a> { impl<'a> Str<'a> { /// Constructs automaton that matches an exact string. #[inline] + #[must_use] pub fn new(string: &'a str) -> Str<'a> { Str { string: string.as_bytes() } } @@ -201,7 +202,7 @@ impl<'a> Automaton for Str<'a> { // if we aren't already past the end... if let Some(pos) = *pos { // and there is still a matching byte at the current position... - if self.string.get(pos).cloned() == Some(byte) { + if self.string.get(pos).copied() == Some(byte) { // then move forward return Some(pos + 1); } @@ -246,6 +247,7 @@ impl<'a> Subsequence<'a> { /// Constructs automaton that matches input containing the /// specified subsequence. 
#[inline] + #[must_use] pub fn new(subsequence: &'a str) -> Subsequence<'a> { Subsequence { subseq: subsequence.as_bytes() } } @@ -279,7 +281,7 @@ impl<'a> Automaton for Subsequence<'a> { if state == self.subseq.len() { return state; } - state + (byte == self.subseq[state]) as usize + state + usize::from(byte == self.subseq[state]) } } @@ -294,25 +296,21 @@ impl Automaton for AlwaysMatch { type State = (); #[inline] - fn start(&self) -> () { - () - } + fn start(&self) {} #[inline] - fn is_match(&self, _: &()) -> bool { + fn is_match(&self, (): &()) -> bool { true } #[inline] - fn can_match(&self, _: &()) -> bool { + fn can_match(&self, (): &()) -> bool { true } #[inline] - fn will_always_match(&self, _: &()) -> bool { + fn will_always_match(&self, (): &()) -> bool { true } #[inline] - fn accept(&self, _: &(), _: u8) -> () { - () - } + fn accept(&self, (): &(), _: u8) {} } /// An automaton that matches a string that begins with something that the diff --git a/src/bytes.rs b/src/bytes.rs index bda32465..ea10c18d 100644 --- a/src/bytes.rs +++ b/src/bytes.rs @@ -29,7 +29,7 @@ pub fn write_u32_le(n: u32, slice: &mut [u8]) { slice[3] = bytes[3]; } -/// Like write_u32_le, but to an io::Write implementation. If every byte could +/// Like `write_u32_le`, but to an `io::Write` implementation. If every byte could /// not be writen, then this returns an error. #[inline] #[cfg(feature = "std")] @@ -56,7 +56,7 @@ pub fn write_u64_le(n: u64, slice: &mut [u8]) { slice[7] = bytes[7]; } -/// Like write_u64_le, but to an io::Write implementation. If every byte could +/// Like `write_u64_le`, but to an `io::Write` implementation. If every byte could /// not be writen, then this returns an error. 
#[inline] #[cfg(feature = "std")] @@ -66,20 +66,20 @@ pub fn io_write_u64_le(n: u64, mut wtr: W) -> io::Result<()> { wtr.write_all(&buf) } -/// pack_uint packs the given integer in the smallest number of bytes possible, +/// `pack_uint` packs the given integer in the smallest number of bytes possible, /// and writes it to the given writer. The number of bytes written is returned /// on success. #[inline] #[cfg(feature = "std")] pub fn pack_uint(wtr: W, n: u64) -> io::Result { let nbytes = pack_size(n); - pack_uint_in(wtr, n, nbytes).map(|_| nbytes) + pack_uint_in(wtr, n, nbytes).map(|()| nbytes) } -/// pack_uint_in is like pack_uint, but always uses the number of bytes given +/// `pack_uint_in` is like `pack_uint`, but always uses the number of bytes given /// to pack the number given. /// -/// `nbytes` must be >= pack_size(n) and <= 8, where `pack_size(n)` is the +/// `nbytes` must be >= `pack_size(n`) and <= 8, where `pack_size(n)` is the /// smallest number of bytes that can store the integer given. #[inline] #[cfg(feature = "std")] @@ -88,32 +88,32 @@ pub fn pack_uint_in( mut n: u64, nbytes: u8, ) -> io::Result<()> { - assert!(1 <= nbytes && nbytes <= 8); + assert!((1..=8).contains(&nbytes)); let mut buf = [0u8; 8]; for i in 0..nbytes { buf[i as usize] = n as u8; - n = n >> 8; + n >>= 8; } wtr.write_all(&buf[..nbytes as usize])?; Ok(()) } -/// unpack_uint is the dual of pack_uint. It unpacks the integer at the current +/// `unpack_uint` is the dual of `pack_uint`. It unpacks the integer at the current /// position in `slice` after reading `nbytes` bytes. /// /// `nbytes` must be >= 1 and <= 8. #[inline] pub fn unpack_uint(slice: &[u8], nbytes: u8) -> u64 { - assert!(1 <= nbytes && nbytes <= 8); + assert!((1..=8).contains(&nbytes)); let mut n = 0; for (i, &b) in slice[..nbytes as usize].iter().enumerate() { - n = n | ((b as u64) << (8 * i)); + n |= u64::from(b) << (8 * i); } n } -/// pack_size returns the smallest number of bytes that can encode `n`. 
+/// `pack_size` returns the smallest number of bytes that can encode `n`. #[inline] #[cfg(feature = "std")] pub fn pack_size(n: u64) -> u8 { diff --git a/src/lib.rs b/src/lib.rs index d1c1dc21..fb417522 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -52,8 +52,8 @@ This requires the `levenshtein` feature in this crate to be enabled. It is not enabled by default. ```rust -use fst::{IntoStreamer, Streamer, Set}; -use fst::automaton::Levenshtein; +use fst_no_std::{IntoStreamer, Streamer, Set}; +use fst_no_std::automaton::Levenshtein; # fn main() { example().unwrap(); } fn example() -> Result<(), Box> { @@ -300,8 +300,9 @@ data structures found in the standard library, such as `BTreeSet` and */ #![cfg_attr(not(feature = "std"), no_std)] -#![deny(missing_docs)] #![cfg_attr(not(feature = "std"), feature(error_in_core))] +#![deny(missing_docs)] +#![allow(clippy::should_implement_trait)] #[cfg(feature = "alloc")] extern crate alloc; diff --git a/src/map.rs b/src/map.rs index 59c2df1d..bd80f3bb 100644 --- a/src/map.rs +++ b/src/map.rs @@ -143,7 +143,7 @@ impl> Map { /// assert_eq!(map.get("z"), None); /// ``` pub fn get>(&self, key: K) -> Option { - self.0.get(key).map(|output| output.value()) + self.0.get(key).map(raw::Output::value) } /// Return a lexicographically ordered stream of all key-value pairs in @@ -635,13 +635,13 @@ pub struct MapBuilder(raw::Builder); impl MapBuilder> { /// Create a builder that builds a map in memory. #[inline] - pub fn memory() -> MapBuilder> { + #[must_use] pub fn memory() -> MapBuilder> { MapBuilder(raw::Builder::memory()) } /// Finishes the construction of the map and returns it. #[inline] - pub fn into_map(self) -> Map> { + #[must_use] pub fn into_map(self) -> Map> { Map(self.0.into_fst()) } } @@ -987,10 +987,16 @@ where pub struct OpBuilder<'m>(raw::OpBuilder<'m>); #[cfg(feature = "alloc")] +impl<'m> Default for OpBuilder<'m> { + fn default() -> Self { + Self::new() + } +} + impl<'m> OpBuilder<'m> { /// Create a new set operation builder. 
#[inline] - pub fn new() -> OpBuilder<'m> { + #[must_use] pub fn new() -> OpBuilder<'m> { OpBuilder(raw::OpBuilder::new()) } @@ -1063,7 +1069,7 @@ impl<'m> OpBuilder<'m> { /// ]); /// ``` #[inline] - pub fn union(self) -> Union<'m> { + #[must_use] pub fn union(self) -> Union<'m> { Union(self.0.union()) } @@ -1104,7 +1110,7 @@ impl<'m> OpBuilder<'m> { /// ]); /// ``` #[inline] - pub fn intersection(self) -> Intersection<'m> { + #[must_use] pub fn intersection(self) -> Intersection<'m> { Intersection(self.0.intersection()) } @@ -1149,7 +1155,7 @@ impl<'m> OpBuilder<'m> { /// ]); /// ``` #[inline] - pub fn difference(self) -> Difference<'m> { + #[must_use] pub fn difference(self) -> Difference<'m> { Difference(self.0.difference()) } @@ -1197,7 +1203,7 @@ impl<'m> OpBuilder<'m> { /// ]); /// ``` #[inline] - pub fn symmetric_difference(self) -> SymmetricDifference<'m> { + #[must_use] pub fn symmetric_difference(self) -> SymmetricDifference<'m> { SymmetricDifference(self.0.symmetric_difference()) } } diff --git a/src/raw/build.rs b/src/raw/build.rs index 3bee7a39..252d6f98 100644 --- a/src/raw/build.rs +++ b/src/raw/build.rs @@ -103,6 +103,7 @@ struct LastTransition { impl Builder> { /// Create a builder that builds an fst in memory. 
#[inline] + #[must_use] pub fn memory() -> Builder> { Builder::new(Vec::with_capacity(10 * (1 << 10))).unwrap() } @@ -284,7 +285,7 @@ impl Builder { { return Ok(EMPTY_ADDRESS); } - let entry = self.registry.entry(&node); + let entry = self.registry.entry(node); if let RegistryEntry::Found(ref addr) = entry { return Ok(*addr); } @@ -304,7 +305,7 @@ impl Builder { } if bs < &**last { return Err(Error::OutOfOrder { - previous: last.to_vec(), + previous: last.clone(), got: bs.to_vec(), } .into()); @@ -396,8 +397,8 @@ impl UnfinishedNodes { fn find_common_prefix(&mut self, bs: &[u8]) -> usize { bs.iter() .zip(&self.stack) - .take_while(|&(&b, ref node)| { - node.last.as_ref().map(|t| t.inp == b).unwrap_or(false) + .take_while(|&(&b, node)| { + node.last.as_ref().is_some_and(|t| t.inp == b) }) .count() } @@ -413,8 +414,8 @@ impl UnfinishedNodes { Some(ref mut t) if t.inp == bs[i] => { i += 1; let common_pre = t.out.prefix(out); - let add_prefix = t.out.sub(common_pre); - out = out.sub(common_pre); + let add_prefix = t.out - common_pre; + out = out - common_pre; t.out = common_pre; add_prefix } diff --git a/src/raw/crc32.rs b/src/raw/crc32.rs index 06b44420..90379e6a 100644 --- a/src/raw/crc32.rs +++ b/src/raw/crc32.rs @@ -20,7 +20,7 @@ impl CheckSummer { /// robust with respect to data that contains the checksum itself. pub fn masked(&self) -> u32 { let sum = self.sum; - (sum.wrapping_shr(15) | sum.wrapping_shl(17)).wrapping_add(0xA282EAD8) + (sum.wrapping_shr(15) | sum.wrapping_shl(17)).wrapping_add(0xA282_EAD8) } /// Update the current checksum with the checksum for the given bytes. diff --git a/src/raw/error.rs b/src/raw/error.rs index 195f8ba8..94e46eea 100644 --- a/src/raw/error.rs +++ b/src/raw/error.rs @@ -91,24 +91,21 @@ impl fmt::Display for Error { Error::Version { expected, got } => write!( f, "\ -Error opening FST: expected API version {}, got API version {}. \ +Error opening FST: expected API version {expected}, got API version {got}. 
\ It looks like the FST you're trying to open is either not an FST file or it \ was generated with a different version of the 'fst' crate. You'll either need \ to change the version of the 'fst' crate you're using, or re-generate the -FST.", - expected, got +FST." ), Error::Format { size } => write!( f, "\ -Error opening FST with size {} bytes: An unknown error occurred. This \ -usually means you're trying to read data that isn't actually an encoded FST.", - size +Error opening FST with size {size} bytes: An unknown error occurred. This \ +usually means you're trying to read data that isn't actually an encoded FST." ), Error::ChecksumMismatch { expected, got } => write!( f, - "FST verification failed: expected checksum of {} but got {}", - expected, got, + "FST verification failed: expected checksum of {expected} but got {got}", ), Error::ChecksumMissing => write!( f, @@ -118,7 +115,7 @@ usually means you're trying to read data that isn't actually an encoded FST.", Error::DuplicateKey { ref got } => write!( f, "Error inserting duplicate key: '{}'.", - format_bytes(&*got) + format_bytes(got) ), #[cfg(feature = "alloc")] Error::OutOfOrder { ref previous, ref got } => write!( @@ -126,14 +123,13 @@ usually means you're trying to read data that isn't actually an encoded FST.", "\ Error inserting out-of-order key: '{}'. (Previous key was '{}'.) Keys must be \ inserted in lexicographic order.", - format_bytes(&*got), - format_bytes(&*previous) + format_bytes(got), + format_bytes(previous) ), Error::WrongType { expected, got } => write!( f, "\ -Error opening FST: expected type '{}', got type '{}'.", - expected, got +Error opening FST: expected type '{expected}', got type '{got}'." 
), } } @@ -184,6 +180,6 @@ impl From for Error { fn format_bytes(bytes: &[u8]) -> String { match str::from_utf8(bytes) { Ok(s) => s.to_owned(), - Err(_) => format!("{:?}", bytes), + Err(_) => format!("{bytes:?}"), } } diff --git a/src/raw/mod.rs b/src/raw/mod.rs index 6f512bb8..b027bde7 100644 --- a/src/raw/mod.rs +++ b/src/raw/mod.rs @@ -75,14 +75,14 @@ const EMPTY_ADDRESS: CompiledAddr = 0; /// This is never the address of a node in a serialized transducer. const NONE_ADDRESS: CompiledAddr = 1; -/// FstType is a convention used to indicate the type of the underlying +/// `FstType` is a convention used to indicate the type of the underlying /// transducer. /// /// This crate reserves the range 0-255 (inclusive) but currently leaves the /// meaning of 0-255 unspecified. pub type FstType = u64; -/// CompiledAddr is the type used to address nodes in a finite state +/// `CompiledAddr` is the type used to address nodes in a finite state /// transducer. /// /// It is most useful as a pointer to nodes. It can be used in the `Fst::node` @@ -361,7 +361,7 @@ impl> Fst { // unexpected EOF. However, we are reading from a byte slice (no // IO errors possible) and we've confirmed the byte slice is at least // N bytes (no unexpected EOF). - let version = bytes::read_u64_le(&bytes); + let version = bytes::read_u64_le(bytes); if version == 0 || version > VERSION { return Err( Error::Version { expected: VERSION, got: version }.into() @@ -589,7 +589,7 @@ impl> Fst { { let mut op = self.op().add(stream).intersection(); let mut count = 0; - while let Some(_) = op.next() { + while op.next().is_some() { count += 1; } count == self.len() @@ -609,7 +609,7 @@ impl> Fst { { let mut op = self.op().add(stream).union(); let mut count = 0; - while let Some(_) = op.next() { + while op.next().is_some() { count += 1; } count == self.len() @@ -617,7 +617,7 @@ impl> Fst { /// Returns the underlying type of this fst. 
/// - /// FstType is a convention used to indicate the type of the underlying + /// `FstType` is a convention used to indicate the type of the underlying /// transducer. /// /// This crate reserves the range 0-255 (inclusive) but currently leaves @@ -1001,10 +1001,7 @@ impl Bound { #[inline] fn is_inclusive(&self) -> bool { - match *self { - Bound::Excluded(_) => false, - _ => true, - } + matches!(*self, Bound::Included(_) | Bound::Unbounded) } } @@ -1090,7 +1087,7 @@ impl<'f, 'a, A: Automaton> Streamer<'a> for Stream<'f, A> { type Item = (&'a [u8], Output); fn next(&'a mut self) -> Option<(&'a [u8], Output)> { - self.0.next_with(|_| ()).map(|(key, out, _)| (key, out)) + self.0.next_with(|_| ()).map(|(key, out, ())| (key, out)) } } @@ -1301,7 +1298,7 @@ where type Item = (&'a [u8], Output, A::State); fn next(&'a mut self) -> Option<(&'a [u8], Output, A::State)> { - self.next_with(|state| state.clone()) + self.next_with(std::clone::Clone::clone) } } @@ -1324,48 +1321,55 @@ pub struct Output(u64); impl Output { /// Create a new output from a `u64`. #[inline] + #[must_use] pub fn new(v: u64) -> Output { Output(v) } /// Create a zero output. #[inline] + #[must_use] pub fn zero() -> Output { Output(0) } /// Retrieve the value inside this output. #[inline] + #[must_use] pub fn value(self) -> u64 { self.0 } /// Returns true if this is a zero output. #[inline] + #[must_use] pub fn is_zero(self) -> bool { self.0 == 0 } /// Returns the prefix of this output and `o`. #[inline] + #[must_use] pub fn prefix(self, o: Output) -> Output { Output(cmp::min(self.0, o.0)) } /// Returns the concatenation of this output and `o`. #[inline] + #[must_use] pub fn cat(self, o: Output) -> Output { Output(self.0 + o.0) } +} + +impl core::ops::Sub for Output { + type Output = Output; - /// Returns the subtraction of `o` from this output. - /// - /// This function panics if `self < o`. 
#[inline] - pub fn sub(self, o: Output) -> Output { - Output( + fn sub(self, rhs: Self) -> Self::Output { + Self( self.0 - .checked_sub(o.0) + .checked_sub(rhs.0) .expect("BUG: underflow subtraction not allowed"), ) } diff --git a/src/raw/node.rs b/src/raw/node.rs index f475f7a7..5db20b19 100644 --- a/src/raw/node.rs +++ b/src/raw/node.rs @@ -47,7 +47,7 @@ impl<'f> fmt::Debug for Node<'f> { writeln!(f, " # transitions: {}", self.len())?; writeln!(f, " transitions:")?; for t in self.transitions() { - writeln!(f, " {:?}", t)?; + writeln!(f, " {t:?}")?; } Ok(()) } @@ -76,7 +76,7 @@ impl<'f> Node<'f> { final_output: Output::zero(), }, State::OneTransNext(s) => { - let data = &data[..addr + 1]; + let data = &data[..=addr]; Node { data, version, @@ -90,7 +90,7 @@ impl<'f> Node<'f> { } } State::OneTrans(s) => { - let data = &data[..addr + 1]; + let data = &data[..=addr]; let sizes = s.sizes(data); Node { data, @@ -105,7 +105,7 @@ impl<'f> Node<'f> { } } State::AnyTrans(s) => { - let data = &data[..addr + 1]; + let data = &data[..=addr]; let sizes = s.sizes(data); let ntrans = s.ntrans(data); Node { @@ -125,13 +125,13 @@ impl<'f> Node<'f> { /// Returns an iterator over all transitions in this node in lexicographic /// order. #[inline] - pub fn transitions<'n>(&'n self) -> Transitions<'f, 'n> { + #[must_use] pub fn transitions<'n>(&'n self) -> Transitions<'f, 'n> { Transitions { node: self, range: 0..self.len() } } /// Returns the transition at index `i`. #[inline(always)] - pub fn transition(&self, i: usize) -> Transition { + #[must_use] pub fn transition(&self, i: usize) -> Transition { // The `inline(always)` annotation on this function appears to // dramatically speed up FST traversal. In particular, measuring the // time it takes to run `fst range something-big.fst` shows almost a 2x @@ -165,7 +165,7 @@ impl<'f> Node<'f> { /// Returns the transition address of the `i`th transition. 
#[inline] - pub fn transition_addr(&self, i: usize) -> CompiledAddr { + #[must_use] pub fn transition_addr(&self, i: usize) -> CompiledAddr { match self.state { State::OneTransNext(s) => { assert!(i == 0); @@ -184,7 +184,7 @@ impl<'f> Node<'f> { /// /// If no transition for this byte exists, then `None` is returned. #[inline] - pub fn find_input(&self, b: u8) -> Option { + #[must_use] pub fn find_input(&self, b: u8) -> Option { match self.state { State::OneTransNext(s) if s.input(self) == b => Some(0), State::OneTransNext(_) => None, @@ -198,14 +198,14 @@ impl<'f> Node<'f> { /// If this node is final and has a terminal output value, then it is /// returned. Otherwise, a zero output is returned. #[inline] - pub fn final_output(&self) -> Output { + #[must_use] pub fn final_output(&self) -> Output { self.final_output } /// Returns true if and only if this node corresponds to a final or "match" /// state in the finite state transducer. #[inline] - pub fn is_final(&self) -> bool { + #[must_use] pub fn is_final(&self) -> bool { self.is_final } @@ -213,31 +213,31 @@ impl<'f> Node<'f> { /// /// The maximum number of transitions is 256. #[inline] - pub fn len(&self) -> usize { + #[must_use] pub fn len(&self) -> usize { self.ntrans } /// Returns true if and only if this node has zero transitions. #[inline] - pub fn is_empty(&self) -> bool { + #[must_use] pub fn is_empty(&self) -> bool { self.ntrans == 0 } /// Return the address of this node. #[inline] - pub fn addr(&self) -> CompiledAddr { + #[must_use] pub fn addr(&self) -> CompiledAddr { self.start } #[doc(hidden)] #[inline] - pub fn as_slice(&self) -> &[u8] { + #[must_use] pub fn as_slice(&self) -> &[u8] { &self.data[self.end..] 
} #[doc(hidden)] #[inline] - pub fn state(&self) -> &'static str { + #[must_use] pub fn state(&self) -> &'static str { match self.state { State::OneTransNext(_) => "OTN", State::OneTrans(_) => "OT", @@ -258,15 +258,13 @@ impl<'f> Node<'f> { && node.is_final && node.final_output.is_zero() { - return Ok(()); + Ok(()) } else if node.trans.len() != 1 || node.is_final { StateAnyTrans::compile(wtr, addr, node) + } else if node.trans[0].addr == last_addr && node.trans[0].out.is_zero() { + StateOneTransNext::compile(wtr, addr, node.trans[0].inp) } else { - if node.trans[0].addr == last_addr && node.trans[0].out.is_zero() { - StateOneTransNext::compile(wtr, addr, node.trans[0].inp) - } else { - StateOneTrans::compile(wtr, addr, node.trans[0]) - } + StateOneTrans::compile(wtr, addr, node.trans[0]) } } } @@ -340,7 +338,7 @@ impl StateOneTransNext { #[inline] #[cfg(feature = "std")] fn set_common_input(&mut self, input: u8) { - self.0 = (self.0 & 0b11_000000) | common_idx(input, 0b111111); + self.0 = (self.0 & 0b11_000000) | common_idx(input, 0b11_1111); } #[inline] @@ -350,11 +348,7 @@ impl StateOneTransNext { #[inline] fn input_len(&self) -> usize { - if self.common_input().is_none() { - 1 - } else { - 0 - } + usize::from(self.common_input().is_none()) } #[inline] @@ -412,7 +406,7 @@ impl StateOneTrans { #[inline] #[cfg(feature = "std")] fn set_common_input(&mut self, input: u8) { - self.0 = (self.0 & 0b10_000000) | common_idx(input, 0b111111); + self.0 = (self.0 & 0b10_000000) | common_idx(input, 0b11_1111); } #[inline] @@ -422,11 +416,7 @@ impl StateOneTrans { #[inline] fn input_len(&self) -> usize { - if self.common_input().is_none() { - 1 - } else { - 0 - } + usize::from(self.common_input().is_none()) } #[inline] @@ -618,11 +608,7 @@ impl StateAnyTrans { #[inline] fn ntrans_len(&self) -> usize { - if self.state_ntrans().is_none() { - 1 - } else { - 0 - } + usize::from(self.state_ntrans().is_none()) } #[inline] @@ -812,7 +798,7 @@ impl<'f, 'n> Iterator for Transitions<'f, 
'n> { } } -/// common_idx translate a byte to an index in the COMMON_INPUTS_INV array. +/// `common_idx` translate a byte to an index in the `COMMON_INPUTS_INV` array. /// /// I wonder if it would be prudent to store this mapping in the FST itself. /// The advantage of doing so would mean that common inputs would reflect the @@ -825,7 +811,7 @@ impl<'f, 'n> Iterator for Transitions<'f, 'n> { #[inline] #[cfg(feature = "std")] fn common_idx(input: u8, max: u8) -> u8 { - let val = ((COMMON_INPUTS[input as usize] as u32 + 1) % 256) as u8; + let val = ((u32::from(COMMON_INPUTS[input as usize]) + 1) % 256) as u8; if val > max { 0 } else { @@ -833,7 +819,7 @@ fn common_idx(input: u8, max: u8) -> u8 { } } -/// common_input translates a common input index stored in a serialized FST +/// `common_input` translates a common input index stored in a serialized FST /// to the corresponding byte. #[inline] fn common_input(idx: u8) -> Option { @@ -927,16 +913,16 @@ mod tests { } TestResult::from_bool(bs == words) } - quickcheck(p as fn(Vec>) -> TestResult) + quickcheck(p as fn(Vec>) -> TestResult); } fn nodes_equal(compiled: &Node, uncompiled: &BuilderNode) -> bool { - println!("{:?}", compiled); + println!("{compiled:?}"); assert_eq!(compiled.is_final(), uncompiled.is_final); assert_eq!(compiled.len(), uncompiled.trans.len()); assert_eq!(compiled.final_output(), uncompiled.final_output); for (ct, ut) in - compiled.transitions().zip(uncompiled.trans.iter().cloned()) + compiled.transitions().zip(uncompiled.trans.iter().copied()) { assert_eq!(ct.inp, ut.inp); assert_eq!(ct.out, ut.out); @@ -954,7 +940,7 @@ mod tests { fn roundtrip(bnode: &BuilderNode) -> bool { let (addr, bytes) = compile(bnode); let node = Node::new(VERSION, addr, &bytes); - nodes_equal(&node, &bnode) + nodes_equal(&node, bnode) } fn trans(addr: CompiledAddr, inp: u8) -> Transition { diff --git a/src/raw/ops.rs b/src/raw/ops.rs index 6f0a09dd..3591a0c5 100644 --- a/src/raw/ops.rs +++ b/src/raw/ops.rs @@ -54,9 +54,16 
@@ pub struct OpBuilder<'f> { } #[cfg(feature = "alloc")] +impl<'f> Default for OpBuilder<'f> { + fn default() -> Self { + Self::new() + } +} + impl<'f> OpBuilder<'f> { /// Create a new set operation builder. #[inline] + #[must_use] pub fn new() -> OpBuilder<'f> { OpBuilder { streams: vec![] } } @@ -99,6 +106,7 @@ impl<'f> OpBuilder<'f> { /// stream, which is an integer that is auto-incremented when a stream /// is added to this operation (starting at `0`). #[inline] + #[must_use] pub fn union(self) -> Union<'f> { Union { heap: StreamHeap::new(self.streams), @@ -117,6 +125,7 @@ impl<'f> OpBuilder<'f> { /// stream, which is an integer that is auto-incremented when a stream /// is added to this operation (starting at `0`). #[inline] + #[must_use] pub fn intersection(self) -> Intersection<'f> { Intersection { heap: StreamHeap::new(self.streams), @@ -141,6 +150,7 @@ impl<'f> OpBuilder<'f> { /// of `difference`, each yielded key contains exactly one `IndexValue` with /// `index` set to 0. #[inline] + #[must_use] pub fn difference(mut self) -> Difference<'f> { let first = self.streams.swap_remove(0); Difference { @@ -168,6 +178,7 @@ impl<'f> OpBuilder<'f> { /// stream, which is an integer that is auto-incremented when a stream /// is added to this operation (starting at `0`). 
#[inline] + #[must_use] pub fn symmetric_difference(self) -> SymmetricDifference<'f> { SymmetricDifference { heap: StreamHeap::new(self.streams), @@ -399,7 +410,7 @@ impl<'f> StreamHeap<'f> { } fn peek_is_duplicate(&self, key: &[u8]) -> bool { - self.heap.peek().map(|s| s.input() == key).unwrap_or(false) + self.heap.peek().is_some_and(|s| s.input() == key) } fn pop_if_equal(&mut self, key: &[u8]) -> Option { @@ -411,7 +422,7 @@ impl<'f> StreamHeap<'f> { } fn pop_if_le(&mut self, key: &[u8]) -> Option { - if self.heap.peek().map(|s| s.input() <= key).unwrap_or(false) { + if self.heap.peek().is_some_and(|s| s.input() <= key) { self.pop() } else { None @@ -468,11 +479,12 @@ impl Slot { } #[cfg(feature = "alloc")] +#[allow(clippy::non_canonical_partial_ord_impl)] impl PartialOrd for Slot { fn partial_cmp(&self, other: &Slot) -> Option { (&self.input, self.output) .partial_cmp(&(&other.input, other.output)) - .map(|ord| ord.reverse()) + .map(std::cmp::Ordering::reverse) } } diff --git a/src/raw/registry.rs b/src/raw/registry.rs index 8b314d4f..98d62214 100644 --- a/src/raw/registry.rs +++ b/src/raw/registry.rs @@ -57,12 +57,12 @@ impl Registry { // // In unscientific experiments, this provides the same compression // as `std::hash::SipHasher` but is much much faster. 
- const FNV_PRIME: u64 = 1099511628211; - let mut h = 14695981039346656037; - h = (h ^ (node.is_final as u64)).wrapping_mul(FNV_PRIME); + const FNV_PRIME: u64 = 1_099_511_628_211; + let mut h = 14_695_981_039_346_656_037; + h = (h ^ u64::from(node.is_final)).wrapping_mul(FNV_PRIME); h = (h ^ node.final_output.value()).wrapping_mul(FNV_PRIME); for t in &node.trans { - h = (h ^ (t.inp as u64)).wrapping_mul(FNV_PRIME); + h = (h ^ u64::from(t.inp)).wrapping_mul(FNV_PRIME); h = (h ^ t.out.value()).wrapping_mul(FNV_PRIME); h = (h ^ (t.addr as u64)).wrapping_mul(FNV_PRIME); } @@ -157,11 +157,11 @@ mod tests { } fn assert_insert_and_found(reg: &mut Registry, bnode: &BuilderNode) { - match reg.entry(&bnode) { + match reg.entry(bnode) { RegistryEntry::NotFound(cell) => cell.insert(1234), entry => panic!("unexpected not found entry, got: {:?}", entry), } - match reg.entry(&bnode) { + match reg.entry(bnode) { RegistryEntry::Found(addr) => assert_eq!(addr, 1234), entry => panic!("unexpected found entry, got: {:?}", entry), } diff --git a/src/raw/tests.rs b/src/raw/tests.rs index c9e77a5f..20966532 100644 --- a/src/raw/tests.rs +++ b/src/raw/tests.rs @@ -3,7 +3,7 @@ use crate::error::Error; use crate::raw::{self, Bound, Builder, Fst, Output, Stream, VERSION}; use crate::stream::Streamer; -const TEXT: &'static str = include_str!("./../../data/words-100000"); +const TEXT: &str = include_str!("./../../data/words-100000"); pub fn fst_set(ss: I) -> Fst> where @@ -15,7 +15,7 @@ where ss.into_iter().map(|s| s.as_ref().to_vec()).collect(); ss.sort(); ss.dedup(); - for s in ss.iter().into_iter() { + for s in ss.iter() { bfst.add(s).unwrap(); } let fst = bfst.into_fst(); @@ -33,7 +33,7 @@ where ss.into_iter().map(|(s, o)| (s.as_ref().to_vec(), o)).collect(); ss.sort(); ss.dedup(); - for (s, o) in ss.into_iter() { + for (s, o) in ss { bfst.insert(s, o).unwrap(); } bfst.into_fst() @@ -148,10 +148,10 @@ macro_rules! 
test_map_fail { test_map!(fst_map_only_empty1, "", 0); test_map!(fst_map_only_empty2, "", 100); -test_map!(fst_map_only_empty3, "", 9999999999); +test_map!(fst_map_only_empty3, "", 9_999_999_999); test_map!(fst_map_one1, "a", 0); test_map!(fst_map_one2, "a", 100); -test_map!(fst_map_one3, "a", 999999999); +test_map!(fst_map_one3, "a", 999_999_999); test_map!(fst_map_two, "a", 1, "b", 2); test_map!(fst_map_many1, "a", 34786, "ab", 26); test_map!( diff --git a/src/set.rs b/src/set.rs index a8add849..1e414a95 100644 --- a/src/set.rs +++ b/src/set.rs @@ -577,13 +577,13 @@ pub struct SetBuilder(raw::Builder); impl SetBuilder> { /// Create a builder that builds a set in memory. #[inline] - pub fn memory() -> SetBuilder> { + #[must_use] pub fn memory() -> SetBuilder> { SetBuilder(raw::Builder::memory()) } /// Finishes the construction of the set and returns it. #[inline] - pub fn into_set(self) -> Set> { + #[must_use] pub fn into_set(self) -> Set> { Set(self.0.into_fst()) } } @@ -859,10 +859,16 @@ where pub struct OpBuilder<'s>(raw::OpBuilder<'s>); #[cfg(feature = "alloc")] +impl<'s> Default for OpBuilder<'s> { + fn default() -> Self { + Self::new() + } +} + impl<'s> OpBuilder<'s> { /// Create a new set operation builder. 
#[inline] - pub fn new() -> OpBuilder<'s> { + #[must_use] pub fn new() -> OpBuilder<'s> { OpBuilder(raw::OpBuilder::new()) } @@ -913,7 +919,7 @@ impl<'s> OpBuilder<'s> { /// assert_eq!(keys, vec![b"a", b"b", b"c", b"y", b"z"]); /// ``` #[inline] - pub fn union(self) -> Union<'s> { + #[must_use] pub fn union(self) -> Union<'s> { Union(self.0.union()) } @@ -936,7 +942,7 @@ impl<'s> OpBuilder<'s> { /// assert_eq!(keys, vec![b"a"]); /// ``` #[inline] - pub fn intersection(self) -> Intersection<'s> { + #[must_use] pub fn intersection(self) -> Intersection<'s> { Intersection(self.0.intersection()) } @@ -961,7 +967,7 @@ impl<'s> OpBuilder<'s> { /// assert_eq!(keys, vec![b"b", b"c"]); /// ``` #[inline] - pub fn difference(self) -> Difference<'s> { + #[must_use] pub fn difference(self) -> Difference<'s> { Difference(self.0.difference()) } @@ -991,7 +997,7 @@ impl<'s> OpBuilder<'s> { /// assert_eq!(keys, vec![b"b", b"c", b"y", b"z"]); /// ``` #[inline] - pub fn symmetric_difference(self) -> SymmetricDifference<'s> { + #[must_use] pub fn symmetric_difference(self) -> SymmetricDifference<'s> { SymmetricDifference(self.0.symmetric_difference()) } } diff --git a/tests/test.rs b/tests/test.rs index 2e05b4d8..67ac153d 100644 --- a/tests/test.rs +++ b/tests/test.rs @@ -6,7 +6,7 @@ use fst_no_std::raw::{Builder, Fst}; use fst_no_std::set::Set; use fst_no_std::{self, Automaton, IntoStreamer, Streamer}; -static WORDS: &'static str = include_str!("../data/words-10000"); +static WORDS: &str = include_str!("../data/words-10000"); fn get_set() -> Set> { Set::from_iter(WORDS.lines()).unwrap() @@ -22,7 +22,7 @@ where let mut ss: Vec> = ss.into_iter().map(|s| s.as_ref().to_vec()).collect(); ss.sort(); - for s in ss.iter().into_iter() { + for s in ss.iter() { bfst.add(s).unwrap(); } let fst = bfst.into_fst(); From ee92e5272ca3bf6c1b19df131f57d801ebe3f2ba Mon Sep 17 00:00:00 2001 From: Jonas Kruckenberg <118265418+CrabNejonas@users.noreply.github.com> Date: Thu, 18 Jan 2024 09:42:08 +0100 
Subject: [PATCH 20/28] fmt --- src/map.rs | 21 ++++++++++++++------- src/raw/node.rs | 37 +++++++++++++++++++++++++------------ src/raw/registry_minimal.rs | 4 ++-- src/set.rs | 21 ++++++++++++++------- 4 files changed, 55 insertions(+), 28 deletions(-) diff --git a/src/map.rs b/src/map.rs index bd80f3bb..4e991afa 100644 --- a/src/map.rs +++ b/src/map.rs @@ -635,13 +635,15 @@ pub struct MapBuilder(raw::Builder); impl MapBuilder> { /// Create a builder that builds a map in memory. #[inline] - #[must_use] pub fn memory() -> MapBuilder> { + #[must_use] + pub fn memory() -> MapBuilder> { MapBuilder(raw::Builder::memory()) } /// Finishes the construction of the map and returns it. #[inline] - #[must_use] pub fn into_map(self) -> Map> { + #[must_use] + pub fn into_map(self) -> Map> { Map(self.0.into_fst()) } } @@ -996,7 +998,8 @@ impl<'m> Default for OpBuilder<'m> { impl<'m> OpBuilder<'m> { /// Create a new set operation builder. #[inline] - #[must_use] pub fn new() -> OpBuilder<'m> { + #[must_use] + pub fn new() -> OpBuilder<'m> { OpBuilder(raw::OpBuilder::new()) } @@ -1069,7 +1072,8 @@ impl<'m> OpBuilder<'m> { /// ]); /// ``` #[inline] - #[must_use] pub fn union(self) -> Union<'m> { + #[must_use] + pub fn union(self) -> Union<'m> { Union(self.0.union()) } @@ -1110,7 +1114,8 @@ impl<'m> OpBuilder<'m> { /// ]); /// ``` #[inline] - #[must_use] pub fn intersection(self) -> Intersection<'m> { + #[must_use] + pub fn intersection(self) -> Intersection<'m> { Intersection(self.0.intersection()) } @@ -1155,7 +1160,8 @@ impl<'m> OpBuilder<'m> { /// ]); /// ``` #[inline] - #[must_use] pub fn difference(self) -> Difference<'m> { + #[must_use] + pub fn difference(self) -> Difference<'m> { Difference(self.0.difference()) } @@ -1203,7 +1209,8 @@ impl<'m> OpBuilder<'m> { /// ]); /// ``` #[inline] - #[must_use] pub fn symmetric_difference(self) -> SymmetricDifference<'m> { + #[must_use] + pub fn symmetric_difference(self) -> SymmetricDifference<'m> { 
SymmetricDifference(self.0.symmetric_difference()) } } diff --git a/src/raw/node.rs b/src/raw/node.rs index 5db20b19..f808fc56 100644 --- a/src/raw/node.rs +++ b/src/raw/node.rs @@ -125,13 +125,15 @@ impl<'f> Node<'f> { /// Returns an iterator over all transitions in this node in lexicographic /// order. #[inline] - #[must_use] pub fn transitions<'n>(&'n self) -> Transitions<'f, 'n> { + #[must_use] + pub fn transitions<'n>(&'n self) -> Transitions<'f, 'n> { Transitions { node: self, range: 0..self.len() } } /// Returns the transition at index `i`. #[inline(always)] - #[must_use] pub fn transition(&self, i: usize) -> Transition { + #[must_use] + pub fn transition(&self, i: usize) -> Transition { // The `inline(always)` annotation on this function appears to // dramatically speed up FST traversal. In particular, measuring the // time it takes to run `fst range something-big.fst` shows almost a 2x @@ -165,7 +167,8 @@ impl<'f> Node<'f> { /// Returns the transition address of the `i`th transition. #[inline] - #[must_use] pub fn transition_addr(&self, i: usize) -> CompiledAddr { + #[must_use] + pub fn transition_addr(&self, i: usize) -> CompiledAddr { match self.state { State::OneTransNext(s) => { assert!(i == 0); @@ -184,7 +187,8 @@ impl<'f> Node<'f> { /// /// If no transition for this byte exists, then `None` is returned. #[inline] - #[must_use] pub fn find_input(&self, b: u8) -> Option { + #[must_use] + pub fn find_input(&self, b: u8) -> Option { match self.state { State::OneTransNext(s) if s.input(self) == b => Some(0), State::OneTransNext(_) => None, @@ -198,14 +202,16 @@ impl<'f> Node<'f> { /// If this node is final and has a terminal output value, then it is /// returned. Otherwise, a zero output is returned. #[inline] - #[must_use] pub fn final_output(&self) -> Output { + #[must_use] + pub fn final_output(&self) -> Output { self.final_output } /// Returns true if and only if this node corresponds to a final or "match" /// state in the finite state transducer. 
#[inline] - #[must_use] pub fn is_final(&self) -> bool { + #[must_use] + pub fn is_final(&self) -> bool { self.is_final } @@ -213,31 +219,36 @@ impl<'f> Node<'f> { /// /// The maximum number of transitions is 256. #[inline] - #[must_use] pub fn len(&self) -> usize { + #[must_use] + pub fn len(&self) -> usize { self.ntrans } /// Returns true if and only if this node has zero transitions. #[inline] - #[must_use] pub fn is_empty(&self) -> bool { + #[must_use] + pub fn is_empty(&self) -> bool { self.ntrans == 0 } /// Return the address of this node. #[inline] - #[must_use] pub fn addr(&self) -> CompiledAddr { + #[must_use] + pub fn addr(&self) -> CompiledAddr { self.start } #[doc(hidden)] #[inline] - #[must_use] pub fn as_slice(&self) -> &[u8] { + #[must_use] + pub fn as_slice(&self) -> &[u8] { &self.data[self.end..] } #[doc(hidden)] #[inline] - #[must_use] pub fn state(&self) -> &'static str { + #[must_use] + pub fn state(&self) -> &'static str { match self.state { State::OneTransNext(_) => "OTN", State::OneTrans(_) => "OT", @@ -261,7 +272,9 @@ impl<'f> Node<'f> { Ok(()) } else if node.trans.len() != 1 || node.is_final { StateAnyTrans::compile(wtr, addr, node) - } else if node.trans[0].addr == last_addr && node.trans[0].out.is_zero() { + } else if node.trans[0].addr == last_addr + && node.trans[0].out.is_zero() + { StateOneTransNext::compile(wtr, addr, node.trans[0].inp) } else { StateOneTrans::compile(wtr, addr, node.trans[0]) diff --git a/src/raw/registry_minimal.rs b/src/raw/registry_minimal.rs index 207c6751..94f2dcc6 100644 --- a/src/raw/registry_minimal.rs +++ b/src/raw/registry_minimal.rs @@ -11,10 +11,10 @@ #![allow(dead_code)] #[cfg(feature = "std")] -use std::collections::hash_map::{Entry, HashMap}; -#[cfg(feature = "std")] use crate::raw::build::BuilderNode; use crate::raw::CompiledAddr; +#[cfg(feature = "std")] +use std::collections::hash_map::{Entry, HashMap}; #[derive(Debug)] #[cfg(feature = "std")] diff --git a/src/set.rs b/src/set.rs index 
1e414a95..73c4a0fb 100644 --- a/src/set.rs +++ b/src/set.rs @@ -577,13 +577,15 @@ pub struct SetBuilder(raw::Builder); impl SetBuilder> { /// Create a builder that builds a set in memory. #[inline] - #[must_use] pub fn memory() -> SetBuilder> { + #[must_use] + pub fn memory() -> SetBuilder> { SetBuilder(raw::Builder::memory()) } /// Finishes the construction of the set and returns it. #[inline] - #[must_use] pub fn into_set(self) -> Set> { + #[must_use] + pub fn into_set(self) -> Set> { Set(self.0.into_fst()) } } @@ -868,7 +870,8 @@ impl<'s> Default for OpBuilder<'s> { impl<'s> OpBuilder<'s> { /// Create a new set operation builder. #[inline] - #[must_use] pub fn new() -> OpBuilder<'s> { + #[must_use] + pub fn new() -> OpBuilder<'s> { OpBuilder(raw::OpBuilder::new()) } @@ -919,7 +922,8 @@ impl<'s> OpBuilder<'s> { /// assert_eq!(keys, vec![b"a", b"b", b"c", b"y", b"z"]); /// ``` #[inline] - #[must_use] pub fn union(self) -> Union<'s> { + #[must_use] + pub fn union(self) -> Union<'s> { Union(self.0.union()) } @@ -942,7 +946,8 @@ impl<'s> OpBuilder<'s> { /// assert_eq!(keys, vec![b"a"]); /// ``` #[inline] - #[must_use] pub fn intersection(self) -> Intersection<'s> { + #[must_use] + pub fn intersection(self) -> Intersection<'s> { Intersection(self.0.intersection()) } @@ -967,7 +972,8 @@ impl<'s> OpBuilder<'s> { /// assert_eq!(keys, vec![b"b", b"c"]); /// ``` #[inline] - #[must_use] pub fn difference(self) -> Difference<'s> { + #[must_use] + pub fn difference(self) -> Difference<'s> { Difference(self.0.difference()) } @@ -997,7 +1003,8 @@ impl<'s> OpBuilder<'s> { /// assert_eq!(keys, vec![b"b", b"c", b"y", b"z"]); /// ``` #[inline] - #[must_use] pub fn symmetric_difference(self) -> SymmetricDifference<'s> { + #[must_use] + pub fn symmetric_difference(self) -> SymmetricDifference<'s> { SymmetricDifference(self.0.symmetric_difference()) } } From 7a32ddb69e713466841a15bb0b78fd7e2e9b877a Mon Sep 17 00:00:00 2001 From: Jonas Kruckenberg 
<118265418+CrabNejonas@users.noreply.github.com> Date: Thu, 18 Jan 2024 09:43:40 +0100 Subject: [PATCH 21/28] Update ci.yml --- .github/workflows/ci.yml | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f287be67..3aa585ed 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -27,7 +27,7 @@ jobs: uses: dtolnay/.github/.github/workflows/pre_ci.yml@master test: - name: Rust ${{matrix.rust}} + name: Rust ${{matrix.name}} needs: pre_ci if: needs.pre_ci.outputs.continue runs-on: ubuntu-latest @@ -35,13 +35,17 @@ jobs: fail-fast: false matrix: include: - - rust: stable + - name: stable/std + rust: stable features: "" - - rust: nightly + - name: nightly/std + rust: nightly features: "" - - rust: nightly + - name: nightly/core + rust: nightly features: "--no-default-features" - - rust: nightly + - name: nightly/alloc + rust: nightly features: "--no-default-features --features alloc" timeout-minutes: 45 steps: From cba9a0aa28a74bdfe5cb08ad6a6bd05d09aa2cfb Mon Sep 17 00:00:00 2001 From: Jonas Kruckenberg <118265418+CrabNejonas@users.noreply.github.com> Date: Thu, 18 Jan 2024 09:48:03 +0100 Subject: [PATCH 22/28] Create deny.toml --- deny.toml | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 deny.toml diff --git a/deny.toml b/deny.toml new file mode 100644 index 00000000..97050c87 --- /dev/null +++ b/deny.toml @@ -0,0 +1,30 @@ +# Target triples to include when checking. This is essentially our supported target list. 
+targets = [ + { triple = "x86_64-unknown-linux-gnu" }, + { triple = "aarch64-unknown-linux-gnu" }, + { triple = "x86_64-pc-windows-msvc" }, + { triple = "x86_64-apple-darwin" }, + { triple = "aarch64-apple-darwin" }, + { triple = "wasm32-unknown-unknown" }, +] + +all-features = true + +# This section is considered when running `cargo deny check licenses` +# More documentation for the licenses section can be found here: +# https://embarkstudios.github.io/cargo-deny/checks/licenses/cfg.html +[licenses] +# List of explicitly allowed licenses +# See https://spdx.org/licenses/ for list of possible licenses +# [possible values: any SPDX 3.11 short identifier (+ optional exception)]. +allow = [ + "MIT", # requires license notice + "Unlicense" +] + +# This section is considered when running `cargo deny check bans`. +# More documentation about the 'bans' section can be found here: +# https://embarkstudios.github.io/cargo-deny/checks/bans/cfg.html +[bans] +# Warn when a crate version requirement is `*` +wildcards = "warn" \ No newline at end of file From b2021e9c30eaff69222a800640fb88157bc112e2 Mon Sep 17 00:00:00 2001 From: Jonas Kruckenberg <118265418+CrabNejonas@users.noreply.github.com> Date: Thu, 18 Jan 2024 09:49:32 +0100 Subject: [PATCH 23/28] Update release-plz.yml --- .github/workflows/release-plz.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release-plz.yml b/.github/workflows/release-plz.yml index d7958128..71646e2c 100644 --- a/.github/workflows/release-plz.yml +++ b/.github/workflows/release-plz.yml @@ -11,7 +11,7 @@ concurrency: on: push: branches: - - main + - master jobs: release-plz: From 871511030b38aa5fb1233f47eabd3fcad439bc33 Mon Sep 17 00:00:00 2001 From: Jonas Kruckenberg <118265418+CrabNejonas@users.noreply.github.com> Date: Thu, 18 Jan 2024 09:50:20 +0100 Subject: [PATCH 24/28] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 
63b9a6ee..adf498fb 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ fst no-std mode === -This is a fork of [fst](https://github.com/BurntSushi/fst) adding support for `no_std` targets (see [`no_std` usage](#no-std-usage) for details). +This is a fork of [fst](https://github.com/BurntSushi/fst) adding support for `no_std` targets (see [`no_std` usage](#no_std-usage) for details). If you're unsure whether to use this fork or the original one: Just use the original, chances are that's more up-to-date. From d6b6a9ee22a2d6739116bb2c64b2a67600135e00 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 18 Jan 2024 10:25:18 +0000 Subject: [PATCH 25/28] chore: release Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- CHANGELOG.md | 301 +++++++++++++++++++++++++++++++++++++++++++++++++++ Cargo.toml | 2 +- 2 files changed, 302 insertions(+), 1 deletion(-) create mode 100644 CHANGELOG.md diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 00000000..1462e1ce --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,301 @@ +# Changelog +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+ +## [Unreleased] + +## [0.4.7](https://github.com/crabnebula-dev/fst-no-std/releases/tag/fst-no-std-v0.4.7) - 2024-01-18 + +### Added +- no-std + +### Fixed +- fst-bin polish/clap/refactor +- fix tests +- fix [#1](https://github.com/crabnebula-dev/fst-no-std/pull/1) +- fix +- fix error + +### Other +- Update README.md +- Update release-plz.yml +- Create deny.toml +- Update ci.yml +- fmt +- clippy +- clippy +- cleanup +- Update Cargo.toml +- Update Cargo.toml +- Update Cargo.toml +- Create release-plz.yml +- Update ci.yml +- Update ci.yml +- Update ci.yml +- Delete fst-regex directory +- Delete fst-levenshtein directory +- Delete fst-bin directory +- Update Cargo.toml +- Update README.md +- Update README.md +- remove .idea files +- Update Cargo.toml +- fix it with updates +- use latest OS versions +- bump pinned to Rust 1.60.0 +- fst-bin-0.4.3 +- various updates +- remove 'num_cpus' and move to 'memmap2' +- upgrade to bstr 1.3 +- fix build +- switch to dtolnay/rust-toolchain +- do not build all docs on pinned Rust either +- don't run tests on pinned Rust version +- pin cross to v0.2.1 +- fst-bin-0.4.2 +- 0.4.7 +- fix 'cargo doc' generation again +- use 'cargo doc --all --no-deps' +- try to set 'doc = false' +- remove redundant check in 'compile' +- 0.4.6 +- remove 'inline' annotations +- make state limit configurable +- fix incorrect doc comment +- introduce the prefix flag to the fuzzy subcommand +- remove "apr" from example list +- update and fix some links +- update for changes in GitHub Actions +- clarify duplicate terminology +- 0.4.5 +- Fst::as_inner to give access to underlying data +- 0.4.4 +- add map_data method +- fst-0.4.3 +- add {Set,Map}::into_fst and Clone impls +- fst-0.4.2 +- add new accept_eof method +- upgrade to actions/checkout@v2 +- fst-0.4.1 +- add into_inner on raw::Fst +- fix typo +- replace `ignore` with `plain` for non code docs +- fix silly mistake +- fst-bin-0.4.1 +- remove use of Result::map_or +- update to regex-automata 0.1.9 +- 
depend on old fst +- remove deprecated crates from workspace +- 0.4.0 +- don't show underlying errors in Display impl +- add 'verify' sub-command +- bump FST version and support older FST versions +- remove AsRef<[u8]> from struct definitions +- more polishing work +- tweak inline annotations +- get rid of hack used before pub(crate) +- remove byteorder +- add CRC32C to every FST +- add "stream with state" APIs +- remove fst-bin from workspace +- add from_iter_{set,map} convenient constructors +- add get_key/get_key_into APIs +- add knobs for reverse and minimization +- drop docopt in favor of clap +- update dependencies and add some polish +- make FSTs generic over AsRef<[u8]> +- minor touchups +- specialize registry case of two cells +- switch to criterion for benchmarks +- move levenshtein, remove regex +- clean up all warnings and make things compile +- switch to Rust 2018 +- switch to GitHub Actions +- switch to rustfmt +- remove superfluous files +- add convenience for creating an FST from builder +- don't skip headers +- 0.3.5 +- add `Str` for exact strings +- 0.3.4 +- add Set/Map::from_static_slice +- 0.3.3 +- add as_bytes method +- explicitly point out that difference has 1 IndexValue +- make the image of billion strings appear +- bump minimum version +- fst-regex-0.2.2 +- add version tag +- add `mmap` feature +- add #[inline] to some public non-generic functions +- implement Clone and Debug on automatons +- 0.3.2 +- derive common traits +- tweak version +- update various dependencies +- add Subsequence +- Implement `Default` for Set and Map +- Updates README Cargo.toml example to say 0.3.0 +- fix example +- Use the SVG Travis badge +- 0.3.0 +- replace try! with ? 
+- bump minimum rustc version to 1.20 +- update to memmap 0.6 +- remove public dependency on memmap crate +- core performance improvements +- remove unnecessary `mut` binding +- add path to [[bin]] +- remove unnecessary mut binding +- remove comment +- 0.2.3 +- safely expose raw bytes of an FST +- 0.2.2 +- 0.2.1 +- update fst-bin to fst refactor +- split regexes and fuzzy search into separate crates +- comment fix +- Add syntax highlighting to README +- 0.1.38 +- don't run benchmarks in travis +- Run tests without default features. +- Fix output value bug. +- 0.1.37 +- Added impl From> for MmapReadOnly +- 0.1.36 +- add get_ref(), bytes_written() to MapBuilder/SetBuilder +- Fix transition iterator for nodes with 256 transitions +- len is the number of keys, not nodes. +- 0.1.35 +- Add Rust 1.7.0 to Travis. +- added mmap feature (default on) +- Minor edit to readme +- Rearrange and publish Automaton implementors. +- 0.1.34 +- Upgrade regex-syntax and memmap. +- 0.1.33 +- Add support for creating an Fst from a slice of Vec. +- remove pushes from Makefile +- 0.1.32 +- bump byteorder +- 0.1.31 +- Merge pull request [#17](https://github.com/crabnebula-dev/fst-no-std/pull/17) from fulmicoton/bugfix/14_slice_offset_len +- rustup +- 0.1.30 +- Fix set difference operation. +- 0.1.29 +- remove unused import +- 0.1.28 +- Closes [#12](https://github.com/crabnebula-dev/fst-no-std/pull/12). Implement From trait for Set and Map. +- Just test on current stable. +- Drop duplicate key errors. +- 0.1.27 +- Closes [#2](https://github.com/crabnebula-dev/fst-no-std/pull/2) Implement Clone and adds len() to MmapReadOnly +- 0.1.26 +- 0.1.25 +- version bump +- Add a new command to embed FSTs into Rust source files. +- Add tests for the automaton adapters +- Rename must_match to will_always_match +- Add various automaton adapters +- memory maps +- update readme finally +- 0.1.24 +- Don't use an a huge literal for usize. Just use u64 instead. 
+- tweaking and polishing +- Improve microbenchmarks. +- add Windows CI +- Some perf improvements to simple lookups. +- 0.1.23 +- 0.1.22 +- Tweak the transition numbers again. +- Add non-generic as_fst methods. +- 0.1.21 +- Represent 256 transitions as `0`, which always uses an extra byte. +- Max transitions is not representable by a u8, doy. +- Escape inputs so we produce valid dot in all cases. +- 0.1.20 +- Add size method. +- doc touchups +- 0.1.19 +- Tweak the mmap setup. +- Bump byteorder and stop using custom write_uint function. +- Don't use types yet. +- 0.1.18 +- 0.1.17 +- 0.1.16 +- Perf improvements. +- Allow tweaking of state labels. +- Add a silly rule for installing a bin. +- 0.1.15 +- Small reorg of nodes. +- Some addition utility methods for collecting streams. +- 0.1.14 +- be honest +- Fix a couple bugs with the checksum and the fst-bin merger. +- Compression gains. Polishing. +- various touchups +- Adding unsorted construction. +- 0.1.13 +- slight rearrangement +- Revamp command line utility to be more ergonomic/consistent. +- doc fixes +- 0.1.12 +- High level docs. +- typo +- 0.1.11 +- docs +- 0.1.10 +- Docs for regex. +- 0.1.9 +- Docs, polish, etc. +- 0.1.8 +- levenshtein poc automaton working +- Remove bound entirely. +- Weaken the bound on Automaton's associated type from Copy to Clone. +- Push Option into the associated type for Automaton. +- 0.1.7 +- Document map type. +- Lots of type renaming. +- 0.1.6 +- More docs, examples and starting on map facade. +- 0.1.5 +- minor correction +- minor correction +- 0.1.4 +- Polish, docs, tests, etc. +- 0.1.3 +- More progress on streams, sets, set operations and the facade. +- Introduce stream ops. +- 0.1.2 +- Impl stream +- Leave 1.0 and 1.1 behind. +- don't use unstable features +- 0.1.1 +- regexes are working +- Progress on a custom regex engine for searching FSTs. +- nail down byteorder dependency +- Re-work merge operations to use streams. +- intersection +- Start polishing work. 
Small optimizations. Start facade. +- Rename inner fst module to raw. +- Add a union sub-command. +- Add range queries to fst-bin. +- Give more information to the merge function. +- Pass reader index into callback. +- Range queries seem to work. +- Remove unused registry. +- Lots of progress. +- ignore +- Various progress and experiments. +- progress +- Progress on merging. Polishing some behavior. +- progress +- progress +- forgot to add +- Initial refactoring complete. Tests pass. Yay. +- progress +- initial working commit +- initial commit diff --git a/Cargo.toml b/Cargo.toml index b879670b..0f5edcfd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "fst-no-std" -version = "0.4.7" #:version +version = "0.4.7" authors = ["Andrew Gallant ", "Jonas Kruckenberg "] description = """ Use finite state transducers to compactly represents sets or maps of many From 3fe9e8d463bf6a9aa51696a7fc23a3e8dbe0f1e9 Mon Sep 17 00:00:00 2001 From: Jonas Kruckenberg <118265418+CrabNejonas@users.noreply.github.com> Date: Thu, 18 Jan 2024 11:39:34 +0100 Subject: [PATCH 26/28] no_std fixes --- src/map.rs | 1 + src/raw/mod.rs | 2 +- src/raw/ops.rs | 3 ++- src/set.rs | 1 + 4 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/map.rs b/src/map.rs index 4e991afa..7d4ae638 100644 --- a/src/map.rs +++ b/src/map.rs @@ -995,6 +995,7 @@ impl<'m> Default for OpBuilder<'m> { } } +#[cfg(feature = "alloc")] impl<'m> OpBuilder<'m> { /// Create a new set operation builder. 
#[inline] diff --git a/src/raw/mod.rs b/src/raw/mod.rs index b027bde7..e0062e82 100644 --- a/src/raw/mod.rs +++ b/src/raw/mod.rs @@ -1298,7 +1298,7 @@ where type Item = (&'a [u8], Output, A::State); fn next(&'a mut self) -> Option<(&'a [u8], Output, A::State)> { - self.next_with(std::clone::Clone::clone) + self.next_with(Clone::clone) } } diff --git a/src/raw/ops.rs b/src/raw/ops.rs index 3591a0c5..be97e7dc 100644 --- a/src/raw/ops.rs +++ b/src/raw/ops.rs @@ -60,6 +60,7 @@ impl<'f> Default for OpBuilder<'f> { } } +#[cfg(feature = "alloc")] impl<'f> OpBuilder<'f> { /// Create a new set operation builder. #[inline] @@ -484,7 +485,7 @@ impl PartialOrd for Slot { fn partial_cmp(&self, other: &Slot) -> Option { (&self.input, self.output) .partial_cmp(&(&other.input, other.output)) - .map(std::cmp::Ordering::reverse) + .map(core::cmp::Ordering::reverse) } } diff --git a/src/set.rs b/src/set.rs index 73c4a0fb..440092bb 100644 --- a/src/set.rs +++ b/src/set.rs @@ -867,6 +867,7 @@ impl<'s> Default for OpBuilder<'s> { } } +#[cfg(feature = "alloc")] impl<'s> OpBuilder<'s> { /// Create a new set operation builder. #[inline] From cad13d5e202f63402c50e03c3e9ce461958bcd10 Mon Sep 17 00:00:00 2001 From: Jonas Kruckenberg <118265418+CrabNejonas@users.noreply.github.com> Date: Thu, 18 Jan 2024 11:43:57 +0100 Subject: [PATCH 27/28] fix: update docs --- src/lib.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index fb417522..86197449 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,4 +1,8 @@ /*! +This is a fork of [fst](https://github.com/BurntSushi/fst) adding support for `no_std` targets (see [`no_std` usage](#no_std-usage) for details). + +If you're unsure whether to use this fork or the original one: Just use the original, chances are that's more up-to-date. + Crate `fst` is a library for efficiently storing and searching ordered sets or maps where the keys are byte strings. 
A key design goal of this crate is to support storing and searching *very large* sets or maps (i.e., billions). This @@ -20,7 +24,7 @@ Simply add a corresponding entry to your `Cargo.toml` dependency list: ```plain [dependencies] -fst = "0.4" +fst-no-std = "0.4" ``` The examples in this documentation will show the rest. From 76ccc0cff9b25e114c745aeb78d854e2c6cf3b01 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 18 Jan 2024 10:44:45 +0000 Subject: [PATCH 28/28] chore: release Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- CHANGELOG.md | 297 +++++++++++++++++++++++++++++++++++++++++++++++++++ Cargo.toml | 2 +- 2 files changed, 298 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1462e1ce..076bcbe8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,303 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.4.8](https://github.com/crabnebula-dev/fst-no-std/compare/fst-no-std-v0.4.7...fst-no-std-v0.4.8) - 2024-01-18 + +### Added +- no-std + +### Fixed +- update docs +- fst-bin polish/clap/refactor +- fix tests +- fix [#1](https://github.com/crabnebula-dev/fst-no-std/pull/1) +- fix +- fix error + +### Other +- no_std fixes +- release +- Update README.md +- Update release-plz.yml +- Create deny.toml +- Update ci.yml +- fmt +- clippy +- clippy +- cleanup +- Update Cargo.toml +- Update Cargo.toml +- Update Cargo.toml +- Create release-plz.yml +- Update ci.yml +- Update ci.yml +- Update ci.yml +- Delete fst-regex directory +- Delete fst-levenshtein directory +- Delete fst-bin directory +- Update Cargo.toml +- Update README.md +- Update README.md +- remove .idea files +- Update Cargo.toml +- fix it with updates +- use latest OS versions +- bump pinned to Rust 1.60.0 +- fst-bin-0.4.3 +- various updates +- remove 'num_cpus' and move to 'memmap2' +- upgrade to bstr 1.3 +- fix 
build +- switch to dtolnay/rust-toolchain +- do not build all docs on pinned Rust either +- don't run tests on pinned Rust version +- pin cross to v0.2.1 +- fst-bin-0.4.2 +- 0.4.7 +- fix 'cargo doc' generation again +- use 'cargo doc --all --no-deps' +- try to set 'doc = false' +- remove redundant check in 'compile' +- 0.4.6 +- remove 'inline' annotations +- make state limit configurable +- fix incorrect doc comment +- introduce the prefix flag to the fuzzy subcommand +- remove "apr" from example list +- update and fix some links +- update for changes in GitHub Actions +- clarify duplicate terminology +- 0.4.5 +- Fst::as_inner to give access to underlying data +- 0.4.4 +- add map_data method +- fst-0.4.3 +- add {Set,Map}::into_fst and Clone impls +- fst-0.4.2 +- add new accept_eof method +- upgrade to actions/checkout@v2 +- fst-0.4.1 +- add into_inner on raw::Fst +- fix typo +- replace `ignore` with `plain` for non code docs +- fix silly mistake +- fst-bin-0.4.1 +- remove use of Result::map_or +- update to regex-automata 0.1.9 +- depend on old fst +- remove deprecated crates from workspace +- 0.4.0 +- don't show underlying errors in Display impl +- add 'verify' sub-command +- bump FST version and support older FST versions +- remove AsRef<[u8]> from struct definitions +- more polishing work +- tweak inline annotations +- get rid of hack used before pub(crate) +- remove byteorder +- add CRC32C to every FST +- add "stream with state" APIs +- remove fst-bin from workspace +- add from_iter_{set,map} convenient constructors +- add get_key/get_key_into APIs +- add knobs for reverse and minimization +- drop docopt in favor of clap +- update dependencies and add some polish +- make FSTs generic over AsRef<[u8]> +- minor touchups +- specialize registry case of two cells +- switch to criterion for benchmarks +- move levenshtein, remove regex +- clean up all warnings and make things compile +- switch to Rust 2018 +- switch to GitHub Actions +- switch to rustfmt +- remove 
superfluous files +- add convenience for creating an FST from builder +- don't skip headers +- 0.3.5 +- add `Str` for exact strings +- 0.3.4 +- add Set/Map::from_static_slice +- 0.3.3 +- add as_bytes method +- explicitly point out that difference has 1 IndexValue +- make the image of billion strings appear +- bump minimum version +- fst-regex-0.2.2 +- add version tag +- add `mmap` feature +- add #[inline] to some public non-generic functions +- implement Clone and Debug on automatons +- 0.3.2 +- derive common traits +- tweak version +- update various dependencies +- add Subsequence +- Implement `Default` for Set and Map +- Updates README Cargo.toml example to say 0.3.0 +- fix example +- Use the SVG Travis badge +- 0.3.0 +- replace try! with ? +- bump minimum rustc version to 1.20 +- update to memmap 0.6 +- remove public dependency on memmap crate +- core performance improvements +- remove unnecessary `mut` binding +- add path to [[bin]] +- remove unnecessary mut binding +- remove comment +- 0.2.3 +- safely expose raw bytes of an FST +- 0.2.2 +- 0.2.1 +- update fst-bin to fst refactor +- split regexes and fuzzy search into separate crates +- comment fix +- Add syntax highlighting to README +- 0.1.38 +- don't run benchmarks in travis +- Run tests without default features. +- Fix output value bug. +- 0.1.37 +- Added impl From> for MmapReadOnly +- 0.1.36 +- add get_ref(), bytes_written() to MapBuilder/SetBuilder +- Fix transition iterator for nodes with 256 transitions +- len is the number of keys, not nodes. +- 0.1.35 +- Add Rust 1.7.0 to Travis. +- added mmap feature (default on) +- Minor edit to readme +- Rearrange and publish Automaton implementors. +- 0.1.34 +- Upgrade regex-syntax and memmap. +- 0.1.33 +- Add support for creating an Fst from a slice of Vec. 
+- remove pushes from Makefile +- 0.1.32 +- bump byteorder +- 0.1.31 +- Merge pull request [#17](https://github.com/crabnebula-dev/fst-no-std/pull/17) from fulmicoton/bugfix/14_slice_offset_len +- rustup +- 0.1.30 +- Fix set difference operation. +- 0.1.29 +- remove unused import +- 0.1.28 +- Closes [#12](https://github.com/crabnebula-dev/fst-no-std/pull/12). Implement From trait for Set and Map. +- Just test on current stable. +- Drop duplicate key errors. +- 0.1.27 +- Closes [#2](https://github.com/crabnebula-dev/fst-no-std/pull/2) Implement Clone and adds len() to MmapReadOnly +- 0.1.26 +- 0.1.25 +- version bump +- Add a new command to embed FSTs into Rust source files. +- Add tests for the automaton adapters +- Rename must_match to will_always_match +- Add various automaton adapters +- memory maps +- update readme finally +- 0.1.24 +- Don't use an a huge literal for usize. Just use u64 instead. +- tweaking and polishing +- Improve microbenchmarks. +- add Windows CI +- Some perf improvements to simple lookups. +- 0.1.23 +- 0.1.22 +- Tweak the transition numbers again. +- Add non-generic as_fst methods. +- 0.1.21 +- Represent 256 transitions as `0`, which always uses an extra byte. +- Max transitions is not representable by a u8, doy. +- Escape inputs so we produce valid dot in all cases. +- 0.1.20 +- Add size method. +- doc touchups +- 0.1.19 +- Tweak the mmap setup. +- Bump byteorder and stop using custom write_uint function. +- Don't use types yet. +- 0.1.18 +- 0.1.17 +- 0.1.16 +- Perf improvements. +- Allow tweaking of state labels. +- Add a silly rule for installing a bin. +- 0.1.15 +- Small reorg of nodes. +- Some addition utility methods for collecting streams. +- 0.1.14 +- be honest +- Fix a couple bugs with the checksum and the fst-bin merger. +- Compression gains. Polishing. +- various touchups +- Adding unsorted construction. +- 0.1.13 +- slight rearrangement +- Revamp command line utility to be more ergonomic/consistent. 
+- doc fixes +- 0.1.12 +- High level docs. +- typo +- 0.1.11 +- docs +- 0.1.10 +- Docs for regex. +- 0.1.9 +- Docs, polish, etc. +- 0.1.8 +- levenshtein poc automaton working +- Remove bound entirely. +- Weaken the bound on Automaton's associated type from Copy to Clone. +- Push Option into the associated type for Automaton. +- 0.1.7 +- Document map type. +- Lots of type renaming. +- 0.1.6 +- More docs, examples and starting on map facade. +- 0.1.5 +- minor correction +- minor correction +- 0.1.4 +- Polish, docs, tests, etc. +- 0.1.3 +- More progress on streams, sets, set operations and the facade. +- Introduce stream ops. +- 0.1.2 +- Impl stream +- Leave 1.0 and 1.1 behind. +- don't use unstable features +- 0.1.1 +- regexes are working +- Progress on a custom regex engine for searching FSTs. +- nail down byteorder dependency +- Re-work merge operations to use streams. +- intersection +- Start polishing work. Small optimizations. Start facade. +- Rename inner fst module to raw. +- Add a union sub-command. +- Add range queries to fst-bin. +- Give more information to the merge function. +- Pass reader index into callback. +- Range queries seem to work. +- Remove unused registry. +- Lots of progress. +- ignore +- Various progress and experiments. +- progress +- Progress on merging. Polishing some behavior. +- progress +- progress +- forgot to add +- Initial refactoring complete. Tests pass. Yay. +- progress +- initial working commit +- initial commit + ## [0.4.7](https://github.com/crabnebula-dev/fst-no-std/releases/tag/fst-no-std-v0.4.7) - 2024-01-18 ### Added diff --git a/Cargo.toml b/Cargo.toml index 0f5edcfd..31ac4a4d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "fst-no-std" -version = "0.4.7" +version = "0.4.8" authors = ["Andrew Gallant ", "Jonas Kruckenberg "] description = """ Use finite state transducers to compactly represents sets or maps of many