ʻOhana
Population structure, admixture history, and selection using learning methods.
qpas/jade.forced_grouping.hpp
1 /* -------------------------------------------------------------------------
2  Ohana
3  Copyright (c) 2015-2020 Jade Cheng (\___/)
4  Jade Cheng <info@jade-cheng.com> (='.'=)
5  ------------------------------------------------------------------------- */
6 
7 #ifndef JADE_FORCED_GROUPING_HPP__
8 #define JADE_FORCED_GROUPING_HPP__
9 
10 #include "jade.randomizer.hpp"
11 
12 namespace jade
13 {
14  ///
15  /// A template for a class that implements the forced grouping feature.
16  ///
17  template <typename TValue>
18  class basic_forced_grouping
19  {
20  public:
21  /// The value type.
22  typedef TValue value_type;
23 
24  /// The matrix type.
26 
27  /// The randomizer type.
29 
30  ///
31  /// Initializes a new instance of the class.
32  ///
34  : _a ()
35  , _b ()
36  , _i (0)
37  , _k (0)
38  {
39  }
40 
41  ///
42  /// Initializes a new instance of the class based on values from the
43  /// specified file.
44  ///
46  char const * const path) ///< The path to the file.
47  : _a ()
48  , _b ()
49  , _i ()
50  , _k ()
51  {
52  assert(path != nullptr);
53  try
54  {
55  std::istringstream in (_strip_comments(path));
56 
57  if (!(in >> _i) || _i == 0)
58  throw error()
59  << "error parsing number of individuals";
60 
61  if (!(in >> _k) || _k == 0)
62  throw error()
63  << "error parsing number of components";
64 
65  _a.reserve(_i);
66 
67  for (size_t i = 0; i < _i; i++)
68  {
69  size_t value;
70 
71  if (!(in >> value))
72  throw error()
73  << "error parsing component assignment "
74  << "for individual " << i+1;
75 
76  _a.push_back(value);
77  }
78 
79  const auto P = 1 + *std::max_element(_a.begin(), _a.end());
80 
81  _b.reserve(P);
82 
83  for (size_t p = 0; p < P; p++)
84  {
85  try
86  {
87  _b.emplace_back(in);
88  }
89  catch (const std::exception & e)
90  {
91  throw error()
92  << "error reading B vector for population index "
93  << p << ": " << e.what();
94  }
95  }
96 
97  _validate(in);
98  }
99  catch (const std::exception & e)
100  {
101  throw error() << "failed to read forced-grouping file '"
102  << path << "': " << e.what();
103  }
104  }
105 
106  ///
107  /// Initializes a new instance of the class based on values from the
108  /// specified file.
109  ///
110  inline explicit basic_forced_grouping(
111  const std::string & path) ///< The path to the file.
112  : basic_forced_grouping(path.c_str())
113  {
114  }
115 
116  ///
117  /// \return The number of individuals.
118  ///
119  inline size_t get_i() const
120  {
121  return _i;
122  }
123 
124  ///
125  /// \return The number of components.
126  ///
127  inline size_t get_k() const
128  {
129  return _k;
130  }
131 
132  ///
133  /// \return The maximum value for a specified individual and component.
134  ///
136  const size_t i, ///< The individual index.
137  const size_t k) ///< The component index.
138  const
139  {
140  assert(i < _i);
141  assert(k < _k);
142  return _b[_a[i]].get_value(k + _k);
143  }
144 
145  ///
146  /// \return The minimim value for a specified individual and component.
147  ///
149  const size_t i, ///< The individual index.
150  const size_t k) ///< The component index.
151  const
152  {
153  assert(i < _i);
154  assert(k < _k);
155  return _b[_a[i]].get_value(k);
156  }
157 
158  ///
159  /// \return A new Q matrix from random values.
160  ///
162  randomizer_type & rnd) ///< The randomizer.
163  const
164  {
165  static const auto epsilon = value_type(0.000001);
166 
167  matrix_type q (_i, _k);
168 
169  for (size_t k = 0; k < _k; k++)
170  for (size_t i = 0; i < _i; i++)
171  q(i, k) = _lerp(get_min(i, k), get_max(i, k), q(i, k));
172 
173  typedef std::uniform_int_distribution<size_t> k_dist_type;
174  typedef std::uniform_real_distribution<value_type> q_dist_type;
175 
176  k_dist_type k_dist (0, _k - 1);
177 
178  auto & engine = rnd.get_engine();
179 
180  for (size_t i = 0; i < _i; i++)
181  {
182  for (;;)
183  {
184  const auto row_sum = q.get_row_sum(i);
185  if (std::fabs(value_type(1) - row_sum) < epsilon)
186  break;
187 
188  const auto k = k_dist(engine);
189  const auto q_ik = q(i, k);
190 
191  if (row_sum > value_type(1))
192  {
193  const auto min_ik = get_min(i, k);
194  const auto distance = row_sum - value_type(1);
195  const auto boundary = std::max(min_ik, q_ik - distance);
196 
197  q_dist_type q_dist (boundary, q_ik);
198  q(i, k) = q_dist(engine);
199  }
200  else
201  {
202  const auto max_ik = get_max(i, k);
203  const auto distance = value_type(1) - row_sum;
204  const auto boundary = std::min(q_ik + distance, max_ik);
205 
206  q_dist_type q_dist (q_ik, boundary);
207  q(i, k) = q_dist(engine);
208  }
209  }
210  }
211 
212  return q;
213  }
214 
215  ///
216  /// Validates the Q matrix and throws an exception if validation fails.
217  /// \return True.
218  ///
220  const matrix_type & q) ///< The Q matrix.
221  const
222  {
223  if (_k != q.get_width())
224  throw error()
225  << "inconsistent number of components specified in "
226  << "forced-grouping file (" << _k << ") and "
227  << q.get_size_str() << " Q matrix";
228 
229  if (_i != q.get_height())
230  throw error()
231  << "inconsistent number of individuals specified in "
232  << "forced-grouping file (" << _i << ") and "
233  << q.get_size_str() << " Q matrix";
234 
235  for (size_t i = 0; i < _i; i++)
236  {
237  for (size_t k = 0; k < _k; k++)
238  {
239  const auto q_ik = q(i, k);
240  const auto min = get_min(i, k);
241  const auto max = get_max(i, k);
242  if (q_ik >= min && q_ik <= max)
243  continue;
244 
245  throw error()
246  << "inconsistent Q matrix cell [" << i+1 << "," << k+1
247  << "] (" << q_ik << ") is outside the range specified "
248  << "in the forced-grouping file " << min << " to "
249  << max;
250  }
251  }
252 
253  return true;
254  }
255 
256  private:
257  // ------------------------------------------------------------------------
258  static value_type _lerp(
259  const value_type min,
260  const value_type max,
261  const value_type percent)
262  {
263  assert(min >= value_type(0));
264  assert(max <= value_type(1));
265  assert(percent >= value_type(0) && percent <= value_type(1));
266 
267  return min + (max - min) * percent;
268  }
269 
270  // --------------------------------------------------------------------
271  static std::string _strip_comments(const char * const path)
272  {
273  assert(path != nullptr);
274 
275  std::ifstream in (path);
276  if (!in.good())
277  throw jade::error() << "error opening file";
278 
279  std::ostringstream out;
280 
281  for (auto n = 1;; n++)
282  {
283  std::string line;
284  std::getline(in, line);
285 
286  if (!line.empty() && line[0] != '#')
287  out << line << std::endl;
288 
289  if (in.eof())
290  break;
291 
292  if (!in.good())
293  throw jade::error() << "error reading file at line " << n;
294  }
295 
296  return out.str();
297  }
298 
299  // ------------------------------------------------------------------------
300  void _validate(std::istringstream & in) const
301  {
302  std::string tmp;
303  if (in >> tmp)
304  throw error()
305  << "invalid token encountered at end of file: " << tmp;
306 
307  if (_i < 2)
308  throw error()
309  << "invalid number of individuals: " << _i
310  << "; expected at least 2";
311 
312  if (_k < 2)
313  throw error()
314  << "invalid number of components: " << _k
315  << "; expected at least 2";
316 
317  const auto k2 = 2 * _k;
318 
319  for (size_t k = 0; k < _k; k++)
320  {
321  const auto & b_k = _b[k];
322 
323  if (b_k.get_height() != k2 || b_k.get_width() != 1)
324  throw error()
325  << "invalid B vector for population index " << k
326  << ": size " << b_k.get_size_str()
327  << " does not match expected [" << k2 << "x1]";
328 
329  auto min_sum = value_type(0);
330  for (size_t kk = 0; kk < _k; kk++)
331  min_sum += b_k[kk];
332  if (min_sum > value_type(1))
333  throw error()
334  << "invalid B vector for population index " << k
335  << ": the sum of the first " << _k
336  << " values is greater than 1";
337 
338  auto max_sum = value_type(0);
339  for (size_t kk = 0; kk < _k; kk++)
340  max_sum += b_k[_k + kk];
341  if (max_sum < value_type(1))
342  throw error()
343  << "invalid B vector for population index " << k
344  << ": the sum of the last " << _k
345  << " values is less than 1";
346 
347  for (size_t kk = 0; kk < k2; kk++)
348  if (b_k[kk] < value_type(0) || b_k[kk] > value_type(1))
349  throw error()
350  << "invalid B vector for population index " << k
351  << ": cell " << kk+1 << " (" << b_k[kk]
352  << ") is not between 0 and 1";
353 
354  for (size_t kk = 0; kk < _k; kk++)
355  if (!(b_k[kk] <= b_k[_k + kk]))
356  throw error()
357  << "invalid B vector for population " << k
358  << ": cell " << kk+1 << " (" << b_k[kk]
359  << ") is greater than cell " << _k+kk+1 << " ("
360  << b_k[_k + kk] << ")";
361  }
362  }
363 
364  std::vector<size_t> _a; // [K] component assignments
365  std::vector<matrix_type> _b; // [K][2K x 1] proportion ranges
366  size_t _i; // number of individuals
367  size_t _k; // number of components
368  };
369 }
370 
371 #endif // JADE_FORCED_GROUPING_HPP__
jade::basic_error::str
const string_type & str() const
Definition: jade.error.hpp:58
jade::basic_randomizer< value_type >
jade::basic_forced_grouping::get_k
size_t get_k() const
Definition: qpas/jade.forced_grouping.hpp:127
jade::basic_randomizer::get_engine
std::default_random_engine & get_engine()
Definition: jade.randomizer.hpp:42
jade::basic_forced_grouping::get_max
value_type get_max(const size_t i, const size_t k) const
Definition: qpas/jade.forced_grouping.hpp:135
jade::basic_forced_grouping::matrix_type
basic_matrix< value_type > matrix_type
The matrix type.
Definition: qpas/jade.forced_grouping.hpp:25
jade::basic_matrix::get_width
size_t get_width() const
Definition: jade.matrix.hpp:757
jade::basic_forced_grouping::basic_forced_grouping
basic_forced_grouping()
Initializes a new instance of the class.
Definition: qpas/jade.forced_grouping.hpp:33
jade::basic_error::what
virtual const char * what() const
jade::basic_forced_grouping::get_min
value_type get_min(const size_t i, const size_t k) const
Definition: qpas/jade.forced_grouping.hpp:148
jade::basic_forced_grouping::basic_forced_grouping
basic_forced_grouping(char const *const path)
Initializes a new instance of the class based on values from the specified file.
Definition: qpas/jade.forced_grouping.hpp:45
jade::basic_forced_grouping::randomizer_type
basic_randomizer< value_type > randomizer_type
The randomizer type.
Definition: qpas/jade.forced_grouping.hpp:28
jade::basic_matrix::get_size_str
std::string get_size_str() const
Definition: jade.matrix.hpp:735
jade::basic_forced_grouping::value_type
TValue value_type
The value type.
Definition: qpas/jade.forced_grouping.hpp:22
jade::basic_matrix::get_height
size_t get_height() const
Definition: jade.matrix.hpp:603
jade::basic_forced_grouping::randomize_q
matrix_type randomize_q(randomizer_type &rnd) const
Definition: qpas/jade.forced_grouping.hpp:161
jade::basic_forced_grouping::get_i
size_t get_i() const
Definition: qpas/jade.forced_grouping.hpp:119
jade::basic_forced_grouping::validate_q
bool validate_q(const matrix_type &q) const
Validates the Q matrix and throws an exception if validation fails.
Definition: qpas/jade.forced_grouping.hpp:219
jade::basic_matrix::get_row_sum
value_type get_row_sum(const size_t row) const
Definition: jade.matrix.hpp:716
jade::basic_error
A template for a class representing an exception thrown from this namespace.
Definition: jade.error.hpp:20
jade::basic_forced_grouping
A template for a class that implements the forced grouping feature.
Definition: cpax/jade.forced_grouping.hpp:19
jade::basic_matrix< value_type >
jade::basic_forced_grouping::basic_forced_grouping
basic_forced_grouping(const std::string &path)
Initializes a new instance of the class based on values from the specified file.
Definition: qpas/jade.forced_grouping.hpp:110