ʻOhana
Population structure, admixture history, and selection using learning methods.
jade.rema.hpp
1 /* -------------------------------------------------------------------------
2  Ohana
3  Copyright (c) 2015-2020 Jade Cheng (\___/)
4  Jade Cheng <info@jade-cheng.com> (='.'=)
5  ------------------------------------------------------------------------- */
6 
7 #ifndef JADE_REMA_HPP__
8 #define JADE_REMA_HPP__
9 
10 #include "jade.args.hpp"
11 #include "jade.matrix.hpp"
12 
13 namespace jade
14 {
15  ///
16  /// A template for a class that reduces columns of matrices given to it.
17  /// The columns to keep are chosen at random, but their relative order is
18  /// not changed.
19  ///
20  template <typename TChar>
21  class basic_rema
22  {
23  public:
24  typedef TChar char_type; ///< A character.
25  typedef std::basic_istream<char_type> istream; ///< An input stream.
26  typedef std::basic_ostream<char_type> ostream; ///< An output stream.
27 
28  ///
29  /// Initializes a new instance of the class, reading the command-line
30  /// arguments.
31  ///
32  explicit basic_rema(
33  jade::args & a) ///< The command-line arguments.
34  : _engine ()
35  , _seed (_read_seed(a))
36  , _num_markers (_read_num_markers(a))
37  {
38  }
39 
40  ///
41  /// Executes the filter through the specified streams.
42  ///
43  void execute(
44  istream & in, ///< The input stream.
45  ostream & out) ///< The output stream.
46  {
47  //
48  // Read the dimensions of the matrix; this must be successful for
49  // the first matrix.
50  //
51  size_t row_count, col_count;
52  if (!(in >> row_count >> col_count))
53  throw jade::error("error reading matrix dimensions");
54 
55  //
56  // Do not try to keep more markers than the number of columns in
57  // the input matrix.
58  //
59  const auto marker_count = std::min(col_count, _num_markers);
60 
61  //
62  // Using the appropriate seed, randomly shuffle a vector that
63  // contains all the possible indices of the matrix; then record a
64  // set of bits to mark the columns to keep.
65  //
66  std::vector<bool> filter_flags;
67  filter_flags.resize(col_count, true);
68  {
69  typedef std::vector<size_t> vector_type;
70  vector_type indices (col_count);
71  std::iota(indices.begin(), indices.end(), 0);
72  _engine.seed(_seed);
73  std::shuffle(indices.begin(), indices.end(), _engine);
74  indices.resize(marker_count);
75  for (const auto index : indices)
76  filter_flags[index] = false;
77  }
78 
79  //
80  // Filter the first matrix.
81  //
82  _filter(in, out, row_count, col_count, marker_count, filter_flags);
83 
84  //
85  // Attempt to read a row; if this fails, assume this is the end of
86  // the data (a discrete genotype matrix), but otherwise, require
87  // the column, require consistent dimensions, and then read the
88  // second matrix.
89  //
90  size_t r, c;
91  if (!(in >> r) && in.eof())
92  return;
93  if (!(in >> c))
94  throw jade::error("error reading second matrix dimensions");
95  if (r != row_count || c != col_count)
96  throw jade::error("inconsistent second matrix dimensions");
97 
98  out << char_type('\n');
99  _filter(in, out, row_count, col_count, marker_count, filter_flags);
100 
101  //
102  // Require the third matrix of a consistent size.
103  //
104  if (!(in >> r >> c))
105  throw jade::error("error reading third matrix dimensions");
106  if (r != row_count || c != col_count)
107  throw jade::error("inconsistent third matrix dimensions");
108 
109  out << char_type('\n');
110  _filter(in, out, row_count, col_count, marker_count, filter_flags);
111 
112  //
113  // Require the end of the data.
114  //
115  for (;;)
116  {
117  const auto ch = in.get();
118  if (ch < 0)
119  return;
120  if (!std::isspace(ch))
121  throw jade::error("unexpected symbol after matrix data");
122  }
123  }
124 
125  private:
126  ///
127  /// The random number engine.
128  ///
129  typedef std::default_random_engine engine_type;
130 
131  ///
132  /// The random number generator seed type.
133  ///
134  typedef engine_type::result_type seed_type;
135 
136  // --------------------------------------------------------------------
137  void _filter(
138  istream & in,
139  ostream & out,
140  const size_t row_count,
141  const size_t col_count,
142  const size_t marker_count,
143  const std::vector<bool> & filter_flags)
144  {
145  out << row_count << char_type(' ')
146  << marker_count << char_type('\n');
147 
148  //
149  // Loop over the rows and columns of the input matrix.
150  //
151  for (size_t r = 0; r < row_count; r++)
152  {
153  size_t m = 0;
154  for (size_t c = 0; c < col_count; c++)
155  {
156  const auto is_unfiltered = !filter_flags[c];
157 
158  //
159  // Skip whitespace; do not allow an end of stream error.
160  //
161  int ch;
162  do
163  if ((ch = in.get()) < 0)
164  throw jade::error("unexpected end of matrix data");
165  while (std::isspace(ch));
166 
167  //
168  // Read all characters for the column, and write them to
169  // the output stream if this column is not discarded; allow
170  // an end of stream error in case the source stream does
171  // not end with an end-of-line character.
172  //
173  do
174  if (is_unfiltered)
175  out.put(static_cast<char_type>(ch));
176  while ((ch = in.get()) >= 0 && !std::isspace(ch));
177 
178  //
179  // Add white-space or the end-of-line after each unfiltered
180  // marker written to the output stream.
181  //
182  if (is_unfiltered)
183  out << (++m == _num_markers
184  ? char_type('\n')
185  : char_type('\t'));
186  }
187  }
188  }
189 
190  ///
191  /// Returns the number of markers.
192  /// \return The number of markers.
193  ///
194  static size_t _read_num_markers(
195  jade::args & a) ///< The command-line arguments.
196  {
197  auto n = a.pop<size_t>();
198  if (n == 0)
199  throw error("invalid number of markers");
200  return n;
201  }
202 
203  ///
204  /// Returns the random number seed.
205  /// \return The random number seed.
206  ///
207  static seed_type _read_seed(
208  jade::args & a) ///< The command-line arguments.
209  {
210  return a.read("--seed", "-s", std::random_device()());
211  }
212 
213  engine_type _engine; ///< The random number engine.
214  seed_type _seed; ///< The seed.
215  size_t _num_markers; ///< The number of markers to keep.
216  };
217 
218  typedef basic_rema<char> rema;
219 }
220 
221 #endif // JADE_REMA_HPP__
jade::basic_rema::char_type
TChar char_type
A character.
Definition: jade.rema.hpp:24
jade::basic_rema::basic_rema
basic_rema(jade::args &a)
Initializes a new instance of the class, reading the command-line arguments.
Definition: jade.rema.hpp:32
jade::basic_args::pop
TValue pop()
Definition: jade.args.hpp:96
jade::basic_rema
A template for a class that reduces columns of matrices given to it. The columns to keep are chosen a...
Definition: jade.rema.hpp:22
jade::basic_args::read
TValue read(char_type const *const long_name, char_type const *const short_name, const TValue fallback=TValue())
Reads and removes an option with one argument. If the option was not specified, then the fallback val...
Definition: jade.args.hpp:117
jade::basic_args
A template for a class that helps process command-line arguments.
Definition: jade.args.hpp:19
jade::basic_rema::ostream
std::basic_ostream< char_type > ostream
An output stream.
Definition: jade.rema.hpp:26
jade::basic_rema::istream
std::basic_istream< char_type > istream
An input stream.
Definition: jade.rema.hpp:25
jade::basic_error
A template for a class representing an exception thrown from this namespace.
Definition: jade.error.hpp:20
jade::basic_rema::execute
void execute(istream &in, ostream &out)
Executes the filter through the specified streams.
Definition: jade.rema.hpp:43