diff --git a/deduplication/__main__.py b/deduplication/__main__.py
index ee61c36..249ffe4 100644
--- a/deduplication/__main__.py
+++ b/deduplication/__main__.py
@@ -6,20 +6,20 @@
 if args.mode == "bloom":
     if args.single:
         assert len(args.input) == 1 and len(args.minhash_dir) == 1 and len(args.name) == 1, "Expected single input argument but got a list"
-        dedup_single_bloom(args.input[0], args.minhash_dir[0], args.num, args.fp, args.output_file, args.name[0], args.sim_threshold, args.num_perm, args.save_dir, not args.skip_minhashing)
+        dedup_single_bloom(args.input[0], args.minhash_dir[0], args.num, args.fp, args.output_file, args.name[0], args.sim_threshold, args.num_perm, args.save_dir, args.skip_minhashing, clear=args.clear)
     elif args.multi:
-        dedup_multi_bloom(args.input, args.minhash_dir, args.num, args.fp, args.output_file, args.name, args.sim_threshold, args.num_perm, args.save_dir, not args.skip_minhashing)
+        dedup_multi_bloom(args.input, args.minhash_dir, args.num, args.fp, args.output_file, args.name, args.sim_threshold, args.num_perm, args.save_dir, args.skip_minhashing, clear=args.clear)
     else:
         assert len(args.input) == 1 and len(args.minhash_dir) == 1 and len(args.name) == 1, "Expected single input argument but got a list"
-        dedup_single_file_bloom(args.input[0], args.minhash_dir[0], args.num, args.fp, args.output_file, args.name[0], args.sim_threshold, args.num_perm, args.save_dir, not args.skip_minhashing)
+        dedup_single_file_bloom(args.input[0], args.minhash_dir[0], args.num, args.fp, args.output_file, args.name[0], args.sim_threshold, args.num_perm, args.save_dir, args.skip_minhashing, clear=args.clear)
 else:
     if args.single:
         assert len(args.input) == 1 and len(args.minhash_dir) == 1 and len(args.name) == 1, "Expected single input argument but got a list"
-        dedup_single_lsh(args.input[0], args.minhash_dir[0], args.output_file, args.name[0], args.sim_threshold, args.num_perm, redis_port=args.redis_port, compute_minhashes=not args.skip_minhashing)
+        dedup_single_lsh(args.input[0], args.minhash_dir[0], args.output_file, args.name[0], args.sim_threshold, args.num_perm, redis_port=args.redis_port, skip_minhashing=args.skip_minhashing)
     elif args.multi:
-        dedup_multi_lsh(args.input, args.minhash_dir, args.output_file, args.name, args.sim_threshold, args.num_perm, redis_port=args.redis_port, compute_minhashes=not args.skip_minhashing)
+        dedup_multi_lsh(args.input, args.minhash_dir, args.output_file, args.name, args.sim_threshold, args.num_perm, redis_port=args.redis_port, skip_minhashing=args.skip_minhashing)
     else:
         assert len(args.input) == 1 and len(args.minhash_dir) == 1 and len(args.name) == 1, "Expected single input argument but got a list"
-        dedup_single_file_lsh(args.input[0], args.minhash_dir[0], args.output_file, args.name[0], args.sim_threshold, args.num_perm, redis_port=args.redis_port, compute_minhashes=not args.skip_minhashing)
+        dedup_single_file_lsh(args.input[0], args.minhash_dir[0], args.output_file, args.name[0], args.sim_threshold, args.num_perm, redis_port=args.redis_port, skip_minhashing=args.skip_minhashing)
diff --git a/deduplication/args.py b/deduplication/args.py
index ce7e792..d1c6006 100644
--- a/deduplication/args.py
+++ b/deduplication/args.py
@@ -51,11 +51,13 @@ def parse_args():
         "--sim-threshold",
         help="Jaccard Similarity threshold for deduplication, should be in [0, 1]. Default is 0.8",
         default=0.8,
+        type=float,
     )
     parser.add_argument(
         "--num-perm",
         help="Number of hash functions for MinHashing. Default is 128",
         default=128,
+        type=int,
     )
     parser.add_argument(
         "--mode",
diff --git a/deduplication/workflows.py b/deduplication/workflows.py
index 3176818..e0d17c1 100644
--- a/deduplication/workflows.py
+++ b/deduplication/workflows.py
@@ -17,7 +17,7 @@ def dedup_single_lsh(
     n_hash_funcs: int = 128,
     redis_name: str = b"tpc",
     redis_port: int = 6379,
-    compute_minhashes: bool = True,
+    skip_minhashing: bool = False,
 ):
     lsh_params = {
         "threshold": sim_threshold,
@@ -29,7 +29,7 @@ def dedup_single_lsh(
         },
     }
 
-    if compute_minhashes:
+    if not skip_minhashing:
         m = MinHasher(input_dir, minhash_dir, n_hash_funcs)
         m.process()
 
@@ -48,7 +48,7 @@ def dedup_multi_lsh(
     n_hash_funcs: int = 128,
     redis_name: str = b"tpc",
     redis_port: int = 6379,
-    compute_minhashes: bool = True,
+    skip_minhashing: bool = False,
 ):
     assert len(input_dirs) == len(minhash_dirs) == len(corpus_names), \
         f"Expected len(input_dirs) == len(minhash_dirs) == len(corpus_names), got {len(input_dirs)}, {len(minhash_dirs)}, {len(corpus_names)}"
@@ -63,7 +63,7 @@ def dedup_multi_lsh(
             n_hash_funcs,
             redis_name,
             redis_port,
-            compute_minhashes,
+            skip_minhashing,
         )
 
 
@@ -76,7 +76,7 @@ def dedup_single_file_lsh(
     n_hash_funcs: int = 128,
     redis_name: str = b"tpc",
     redis_port: int = 6379,
-    compute_minhashes: bool = True,
+    skip_minhashing: bool = False,
 ):
     lsh_params = {
         "threshold": sim_threshold,
@@ -88,7 +88,7 @@ def dedup_single_file_lsh(
         },
     }
 
-    if compute_minhashes:
+    if not skip_minhashing:
         m = MinHasher(None, minhash_dir, n_hash_funcs)
         m.compute_minhash_for_file(input_file)
 
@@ -105,6 +105,7 @@ def clear_dir(save_dir):
     if os.path.exists(save_dir):
         rm_files = [os.path.join(save_dir, f) for f in os.listdir(save_dir) if ".bf" in f or '.csv' in f]
         for f in rm_files:
+            print(f"Clearing {f}...")
             os.remove(f)
 
 
@@ -119,7 +120,7 @@ def dedup_single_bloom(
     sim_threshold: float = 0.8,
     n_hash_funcs: int = 128,
     save_dir: str = "./",
-    compute_minhashes: bool = True,
+    skip_minhashing: bool = False,
     clear: bool = False,
 ):
     if clear:
@@ -133,7 +134,7 @@ def dedup_single_bloom(
         "save_dir": save_dir
     }
 
-    if compute_minhashes:
+    if not skip_minhashing:
         m = MinHasher(input_dir, minhash_dir, n_hash_funcs)
         m.process()
 
@@ -153,7 +154,7 @@ def dedup_multi_bloom(
     sim_threshold: float = 0.8,
     n_hash_funcs: int = 128,
     save_dir: str = "./",
-    compute_minhashes: bool = True,
+    skip_minhashing: bool = False,
     clear: bool = False,
 ):
     assert len(input_dirs) == len(minhash_dirs) == len(corpus_names), \
@@ -173,7 +174,7 @@ def dedup_multi_bloom(
             sim_threshold,
             n_hash_funcs,
             save_dir,
-            compute_minhashes,
+            skip_minhashing,
             clear=False
         )
 
@@ -187,7 +188,7 @@ def dedup_single_file_bloom(
     sim_threshold: float = 0.8,
     n_hash_funcs: int = 128,
     save_dir: str = "./",
-    compute_minhashes: bool = True,
+    skip_minhashing: bool = False,
     clear: bool = False,
 ):
     if clear:
@@ -201,7 +202,7 @@ def dedup_single_file_bloom(
         "save_dir": save_dir
     }
 
-    if compute_minhashes:
+    if not skip_minhashing:
         m = MinHasher(None, minhash_dir, n_hash_funcs)
         m.compute_minhash_for_file(input_file)
 