Raptor 3.0.0-rc.1
A fast and space-efficient pre-filter for querying very large collections of nucleotide sequences
 
cutoff.hpp
Go to the documentation of this file.
1// --------------------------------------------------------------------------------------------------
2// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin
3// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik
4// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5// shipped with this file and also available at: https://github.com/seqan/raptor/blob/main/LICENSE.md
6// --------------------------------------------------------------------------------------------------
7
13#pragma once
14
15#include <seqan3/io/sequence_file/format_fasta.hpp>
16
18
19namespace raptor
20{
21
22class cutoff
23{
24public:
25 cutoff() = default;
26 cutoff(cutoff const &) = default;
27 cutoff & operator=(cutoff const &) = default;
28 cutoff(cutoff &&) = default;
29 cutoff & operator=(cutoff &&) = default;
30 ~cutoff() = default;
31
32 cutoff(prepare_arguments const & arguments) : fixed_cutoff{arguments.kmer_count_cutoff}
33 {
34 if (arguments.use_filesize_dependent_cutoff)
35 {
36 cutoff_kind = cutoff_kinds::filesize_dependent;
37 }
38 else
39 {
40 cutoff_kind = cutoff_kinds::fixed;
41 }
42 }
43
44 uint8_t get(std::filesystem::path const & filename) const noexcept
45 {
46 switch (cutoff_kind)
47 {
48 case cutoff_kinds::filesize_dependent:
49 return impl(filename);
50 default:
51 {
52 assert(cutoff_kind == cutoff_kinds::fixed);
53 return fixed_cutoff;
54 }
55 }
56 }
57
58 static inline bool file_is_compressed(std::filesystem::path const & filepath)
59 {
60 std::filesystem::path const extension = filepath.extension();
61 return extension == ".gz" || extension == ".bgzf" || extension == ".bz2";
62 }
63
64private:
65 enum class cutoff_kinds
66 {
67 fixed,
68 filesize_dependent
69 };
70
71 cutoff_kinds cutoff_kind{cutoff_kinds::fixed};
72
73 uint8_t fixed_cutoff{};
74 // Cutoffs and bounds from Mantis
75 // Mantis ignores k-mers which appear less than a certain cutoff. The cutoff is based on the file size of a
76 // gzipped fastq file. Small files have only a cutoff of 1 while big files have a cutoff value of 50.
77 // https://doi.org/10.1016/j.cels.2018.05.021
78 // Supplement Table S1
79 // https://www.cell.com/cms/10.1016/j.cels.2018.05.021/attachment/0a3d402b-8b90-42c0-a709-22f246fd1759/mmc1.pdf
80 static constexpr std::array<uint8_t, 4> const cutoffs{1u, 3u, 10u, 20u};
81 static constexpr std::array<uint64_t, 4> const cutoff_bounds{314'572'800ULL,
82 524'288'000ULL,
83 1'073'741'824ULL,
84 3'221'225'472ULL};
85
86 uint8_t impl(std::filesystem::path const & filename) const
87 {
88 bool const is_compressed = file_is_compressed(filename);
89 bool const is_fasta = check_for_fasta_format(filename);
90
91 // Since the curoffs are based on the filesize of a gzipped fastq file, we try account for the other cases:
92 // We multiply by two if we have fasta input.
93 // We divide by 3 if the input is not compressed.
94 size_t const filesize = std::filesystem::file_size(filename) * (is_fasta ? 2 : 1) / (is_compressed ? 1 : 3);
95
96 uint8_t cutoff{50u};
97 for (size_t i = 0; i < cutoff_bounds.size(); ++i)
98 {
99 if (filesize <= cutoff_bounds[i])
100 {
101 cutoff = cutoffs[i];
102 break;
103 }
104 }
105
106 return cutoff;
107 }
108
109 static inline bool
110 check_for_fasta_format(std::filesystem::path const & filepath,
111 std::vector<std::string> const & valid_extensions = seqan3::format_fasta::file_extensions)
112 {
113 std::string const extension = file_is_compressed(filepath) ? filepath.stem() : filepath.extension();
114
115 auto case_insensitive_string_ends_with = [&](std::string_view str, std::string_view suffix)
116 {
117 size_t const suffix_length{suffix.size()};
118 size_t const str_length{str.size()};
119 return suffix_length > str_length ? false
120 : std::ranges::equal(str.substr(str_length - suffix_length),
121 suffix,
122 [](char const chr1, char const chr2)
123 {
124 return std::tolower(chr1) == std::tolower(chr2);
125 });
126 };
127
128 auto case_insensitive_ends_with = [&](std::string const & ext)
129 {
130 return case_insensitive_string_ends_with(extension, ext);
131 };
132
133 return std::ranges::find_if(valid_extensions, case_insensitive_ends_with) != valid_extensions.end();
134 }
135};
136
137} // namespace raptor
Definition: cutoff.hpp:23
T equal(T... args)
T extension(T... args)
T file_size(T... args)
T find_if(T... args)
Provides raptor::prepare_arguments.
T size(T... args)
Definition: prepare_arguments.hpp:28
T substr(T... args)