ʻOhana
Population structure, admixture history, and selection using learning methods.
jade.neoscan.hpp
1 /* -------------------------------------------------------------------------
2  Ohana
3  Copyright (c) 2015-2020 Jade Cheng (\___/)
4  Jade Cheng <info@jade-cheng.com> (='.'=)
5  ------------------------------------------------------------------------- */
6 
7 #ifndef JADE_NEOSCAN_HPP__
8 #define JADE_NEOSCAN_HPP__
9 
10 #include "jade.args.hpp"
11 #include "jade.genotype_matrix_factory.hpp"
12 
13 namespace jade
14 {
15  ///
16  /// A template for a class implementing the neoscan algorithm. This
17  /// algorithm scans for positive selection between ancient and modern data.
18  /// It can take advantage of the dating for each individual ancient sample.
19  ///
20  template <typename TValue>
22  {
23  basic_neoscan() = delete;
24  basic_neoscan(const basic_neoscan &) = delete;
25  basic_neoscan & operator = (const basic_neoscan &) = delete;
26 
27  public:
28  /// The value type.
29  typedef TValue value_type;
30 
31  /// The matrix type.
33 
34  /// The verification type.
36 
37  /// The genotype matrix type.
39 
40  /// The genotype matrix factory type.
41  typedef
44 
45  ///
46  /// Output for one column.
47  ///
48  struct output
49  {
50  value_type delta; ///< The delta.
51  value_type global_lle; ///< The global lle.
52  value_type local_lle; ///< The local lle.
53 
54  ///
55  /// \return The computed log-likelihood ratio, i.e.
56  /// 2.0 * (local_lle - global_lle).
57  ///
59  {
60  return value_type(2.0) * (local_lle - global_lle);
61  }
62 
63  ///
64  /// Returns the delta, global_lle, local_lle, and computed
65  /// log-likelihood ratio converted to high-precision strings and
66  /// joined by the tab character. The returned string does not
67  /// include and end-of-line character.
68  ///
69  /// \return A string representing the output values.
70  ///
71  std::string to_string() const
72  {
73  std::ostringstream out;
75  out << std::showpos
76  << delta << '\t'
77  << global_lle << '\t'
78  << local_lle << '\t'
79  << compute_lle_ratio();
80  return out.str();
81  }
82  };
83 
84  ///
85  /// Initializes a new instance of the class. Memory for the specified
86  /// G, Q, and Fa matrices must remain valid for the lifetime of this
87  /// instance.
88  ///
90  const genotype_matrix_type & g, ///< The G matrix.
91  const matrix_type & q, ///< The Q matrix.
92  const matrix_type & f, ///< The F matrix.
93  const matrix_type & years) ///< The years matrix.
94  : _g (g)
95  , _q (q)
96  , _f (f)
97  , _y (_init_y(years, q))
98  , _f_j (f.get_height(), 1)
99  {
101  }
102 
103  ///
104  /// Executes the algorithm using the specified action for the output
105  /// of each column.
106  ///
107  /// \param output_action The action to perform for the output; this
108  /// function or lambda must take one argument of
109  /// type, output.
110  ///
111  template <typename TOutputAction>
112  void execute(const TOutputAction & output_action) const
113  {
114  const auto J = _g.get_width();
115 
116  for (size_t j = 0; j < J; j++)
117  {
118  value_type col_min = 0;
119  value_type col_max = 0;
120  _f.get_min_max_column(j, col_min, col_max);
121 
122  const auto range_low = -col_max;
123  const auto range_high = value_type(1.0) - col_min;
124 
125  output out;
126  out.delta = value_type(0.0);
127  out.global_lle = _compute_lle_j(j, out.delta);
128  out.local_lle = out.global_lle;
129 
130  static const auto tol = value_type(0.000001);
131  static const auto phi = value_type(0.5 * (sqrt(5.0) + 1.0));
132 
133  const auto dr_phi = (range_high - range_low) / phi;
134 
135  auto a = range_low;
136  auto b = range_high;
137  auto c = range_high - dr_phi;
138  auto d = range_low + dr_phi;
139 
140  while (std::abs(c - d) > tol)
141  {
142  if (_compute_lle_j(j, c) > _compute_lle_j(j, d))
143  b = d;
144  else
145  a = c;
146 
147  c = b - (b - a) / phi;
148  d = a + (b - a) / phi;
149  }
150 
151  const auto gss_delta = value_type(0.5) * (a + b);
152  const auto gss_lle = _compute_lle_j(j, gss_delta);
153 
154  if (gss_lle > out.local_lle)
155  {
156  out.delta = gss_delta;
157  out.local_lle = gss_lle;
158  }
159 
160  output_action(out);
161  }
162  }
163 
164  ///
165  /// Runs the program based on command-line arguments.
166  /// \param a The command-line arguments.
167  ///
168  static void run(args & a)
169  {
170  const std::unique_ptr<genotype_matrix_type> g_ptr (
171  genotype_matrix_factory_type::create(a.pop<std::string>()));
172 
173  const matrix_type q (a.pop<std::string>());
174  const matrix_type f (a.pop<std::string>());
175  const matrix_type y (a.pop<std::string>());
176  a.validate_empty();
177 
178  std::cout << "d\tglobal_lle\tlocal_lle\tlle_ratio\n";
179 
180  const basic_neoscan neoscan (*g_ptr, q, f, y);
181 
182  neoscan.execute([](const output & out)
183  {
184  std::cout << out.to_string() << std::endl;
185  });
186  }
187 
188  private:
191 
192  // --------------------------------------------------------------------
193  void _compute_ab_ij(
194  const size_t i,
195  value_type & a_ij,
196  value_type & b_ij)
197  const
198  {
199  assert(i < _y.get_length());
200 
201  const auto K = _f_j.get_length();
202 
203  auto q_src = _q.get_data(i, 0);
204  auto f_j_src = _f_j.get_data();
205  const auto f_j_end = _f_j.get_data() + K;
206 
207  a_ij = value_type(0.0);
208  b_ij = value_type(0.0);
209 
210  while (f_j_src != f_j_end)
211  {
212  const auto q_ik = *q_src++;
213  const auto f_kj = *f_j_src++;
214 
215  a_ij += q_ik * f_kj;
216  b_ij += q_ik * (value_type(1.0) - f_kj);
217  }
218  }
219 
220  // --------------------------------------------------------------------
221  void _compute_f_j(
222  const size_t j,
223  const size_t i,
224  const value_type d)
225  const
226  {
227  static const auto epsilon = value_type(1.0e-6);
228  static const auto lower_epsilon = value_type(0.0) + epsilon;
229  static const auto upper_epsilon = value_type(1.0) - epsilon;
230 
231  const auto J = _f.get_width();
232  const auto K = _f.get_height();
233 
234  assert(j < J);
235  assert(i < _y.get_height());
236 
237  auto src_ptr = _f.get_data(0, j);
238  auto dst_ptr = _f_j.get_data();
239  const auto dst_end = _f_j.get_data() + K;
240  const auto dy = d * _y[i];
241 
242  while (dst_ptr != dst_end)
243  {
244  *dst_ptr++ = std::min(std::max(
245  lower_epsilon,
246  *src_ptr + dy),
247  upper_epsilon);
248  src_ptr += J;
249  }
250  }
251 
252  // --------------------------------------------------------------------
253  value_type _compute_lle_j(const size_t j, const value_type d) const
254  {
255  const auto I = _q.get_height();
256 
257  assert(d >= value_type(-1.0) && d <= value_type(+1.0));
258  assert(j < _g.get_width());
259 
260  if (_g.is_dgm())
261  {
262  const auto & g = _g.to_dgm().get_matrix();
263 
264  auto lle_all = value_type(0.0);
265 
266  for (size_t i = 0; i < I; i++)
267  {
268  value_type g_ij;
269  if (!_try_convert(g(i, j), g_ij))
270  continue;
271 
272  _compute_f_j(j, i, d);
273 
274  value_type a_ij, b_ij;
275  _compute_ab_ij(i, a_ij, b_ij);
276 
277  lle_all += std::log(a_ij) * g_ij;
278  lle_all += std::log(b_ij) * (value_type(2.0) - g_ij);
279  }
280 
281  return lle_all;
282  }
283 
284  if (_g.is_lgm())
285  {
286  const auto & g = _g.to_lgm();
287  const auto & g_AA = g.get_major_major_matrix();
288  const auto & g_Aa = g.get_major_minor_matrix();
289  const auto & g_aa = g.get_minor_minor_matrix();
290 
291  auto lle_all = value_type(0.0);
292 
293  for (size_t i = 0; i < I; i++)
294  {
295  if (_y[i] < value_type(0.0))
296  continue;
297 
298  const auto g_AA_ij = g_AA(i, j);
299  const auto g_Aa_ij = g_Aa(i, j);
300  const auto g_aa_ij = g_aa(i, j);
301 
302  _compute_f_j(j, i, d);
303 
304  value_type a_ij, b_ij;
305  _compute_ab_ij(i, a_ij, b_ij);
306 
307  lle_all += std::log(
308  (g_AA_ij * a_ij * a_ij) +
309  (g_aa_ij * b_ij * b_ij) +
310  (g_Aa_ij * a_ij * b_ij * value_type(2.0)));
311  }
312 
313  return lle_all;
314  }
315 
316  throw jade::error("unsupported genotype matrix");
317  }
318 
319  // --------------------------------------------------------------------
320  static matrix_type _init_y(
321  const matrix_type & years, ///< The years (loaded from a file).
322  const matrix_type & q) ///< The Q matrix.
323  {
324  if (!years.is_column_vector())
325  throw jade::error()
326  << "invalid years matrix has "
327  << years.get_width()
328  << " columns; expected column vector";
329 
330  const auto I = q.get_height();
331  assert(I > 0);
332 
333  if (years.get_height() != I)
334  throw jade::error()
335  << "inconsistent number of years specified ("
336  << years.get_height()
337  << "); expected height of Q matrix (" << I << ")";
338 
339  matrix_type out (I, 1);
340 
341  const auto max_value = years.get_max_value();
342  const auto min_value = years.get_min_value();
343  const auto avg_value = years.get_sum() / value_type(I);
344  const auto years_end = years.get_data() + I;
345  auto years_ptr = years.get_data();
346  auto out_ptr = out.get_data();
347 
348  while (years_ptr != years_end)
349  {
350  const auto y = *years_ptr++;
351  *out_ptr++ = y < value_type(0.0)
352  ? value_type(1.0)
353  : (avg_value - y) / std::max(max_value - y, y - min_value);
354  }
355 
356  return out;
357  }
358 
359  // --------------------------------------------------------------------
360  static bool _try_convert(const genotype g, value_type & out)
361  {
362  switch (g)
363  {
364  case jade::genotype_major_major:
365  out = value_type(0.0);
366  return true;
367 
368  case jade::genotype_major_minor:
369  out = value_type(1.0);
370  return true;
371 
372  case jade::genotype_minor_minor:
373  out = value_type(2.0);
374  return true;
375 
376  default:
377  return false;
378  }
379  }
380 
381  const genotype_matrix_type & _g; // [I x J]
382  const matrix_type & _q; // [I x K]
383  const matrix_type & _f; // [K x J]
384  const matrix_type _y; // [I x 1]
385 
386  // scratch space
387  mutable matrix_type _f_j; // [K x 1]
388  };
389 }
390 
391 #endif // JADE_NEOSCAN_HPP__
jade::basic_neoscan
A template for a class implementing the neoscan algorithm. This algorithm scans for positive selectio...
Definition: jade.neoscan.hpp:22
jade::basic_verification
A template for a class that performs validation on various types of matrices.
Definition: jade.verification.hpp:20
jade::basic_args::validate_empty
void validate_empty() const
Throws an exception if there are more arguments to be processed.
Definition: jade.args.hpp:161
jade::basic_neoscan::output::compute_lle_ratio
value_type compute_lle_ratio() const
Definition: jade.neoscan.hpp:58
jade::basic_neoscan::output::global_lle
value_type global_lle
The global lle.
Definition: jade.neoscan.hpp:51
jade::basic_matrix::get_data
const value_type * get_data() const
Definition: jade.matrix.hpp:542
jade::basic_genotype_matrix_factory::create
static genotype_matrix_type * create(const std::string &path)
Creates a genotype matrix based on values from a file. This function determines what kind of genotype...
Definition: jade.genotype_matrix_factory.hpp:48
jade::basic_matrix::get_min_max_column
bool get_min_max_column(const size_t column, value_type &min, value_type &max) const
Returns the minimum and maximum elements in a column. If the matrix is empty, this method returns fal...
Definition: jade.matrix.hpp:686
jade::basic_matrix::get_length
size_t get_length() const
Definition: jade.matrix.hpp:624
jade::basic_neoscan::execute
void execute(const TOutputAction &output_action) const
Executes the algorithm using the specified action for the output of each column.
Definition: jade.neoscan.hpp:112
jade::basic_likelihood_genotype_matrix
A template class implementing operations for a likelihood genotype matrix type.
Definition: jade.likelihood_genotype_matrix.hpp:21
jade::basic_neoscan::basic_neoscan
basic_neoscan(const genotype_matrix_type &g, const matrix_type &q, const matrix_type &f, const matrix_type &years)
Initializes a new instance of the class. Memory for the specified G, Q, and Fa matrices must remain v...
Definition: jade.neoscan.hpp:89
jade::basic_matrix::get_width
size_t get_width() const
Definition: jade.matrix.hpp:757
jade::basic_neoscan::genotype_matrix_type
basic_genotype_matrix< value_type > genotype_matrix_type
The genotype matrix type.
Definition: jade.neoscan.hpp:38
jade::basic_neoscan::output::to_string
std::string to_string() const
Returns the delta, global_lle, local_lle, and computed log-likelihood ratio converted to high-precisi...
Definition: jade.neoscan.hpp:71
jade::basic_genotype_matrix
A template for an abstract class implementing operations for a genotype matrix.
Definition: jade.genotype_matrix.hpp:26
jade::basic_neoscan::verification_type
basic_verification< value_type > verification_type
The verification type.
Definition: jade.neoscan.hpp:35
jade::basic_matrix< value_type >::set_high_precision
static void set_high_precision(std::ostream &out)
Sets the specified stream to scientific notation with a high precision.
Definition: jade.matrix.hpp:1154
jade::basic_genotype_matrix::get_width
virtual size_t get_width() const =0
jade::basic_neoscan::output
Output for one column.
Definition: jade.neoscan.hpp:49
jade::basic_neoscan::value_type
TValue value_type
The value type.
Definition: jade.neoscan.hpp:29
jade::basic_args::pop
TValue pop()
Definition: jade.args.hpp:96
jade::basic_neoscan::run
static void run(args &a)
Runs the program based on command-line arguments.
Definition: jade.neoscan.hpp:168
jade::basic_matrix::get_height
size_t get_height() const
Definition: jade.matrix.hpp:603
jade::basic_verification::validate_gqf_sizes
static bool validate_gqf_sizes(const genotype_matrix_type &g, const matrix_type &q, const matrix_type &f)
Validates the sizes of the G, Q, and F matrices and throws an exception if validation fails.
Definition: jade.verification.hpp:211
jade::basic_discrete_genotype_matrix
A template for a class implementing operations for a discrete genotype matrix.
Definition: jade.discrete_genotype_matrix.hpp:22
jade::basic_args
A template for a class that helps process command-line arguments.
Definition: jade.args.hpp:19
jade::basic_neoscan::genotype_matrix_factory_type
basic_genotype_matrix_factory< value_type > genotype_matrix_factory_type
The genotype matrix factory type.
Definition: jade.neoscan.hpp:43
jade::basic_genotype_matrix_factory
A template for a class that creates genotype matrices based on files and their file extensions.
Definition: jade.genotype_matrix_factory.hpp:21
jade::basic_error
A template for a class representing an exception thrown from this namespace.
Definition: jade.error.hpp:20
jade::basic_matrix< value_type >
jade::basic_neoscan::output::local_lle
value_type local_lle
The local lle.
Definition: jade.neoscan.hpp:52
jade::basic_neoscan::output::delta
value_type delta
The delta.
Definition: jade.neoscan.hpp:50
jade::basic_neoscan::matrix_type
basic_matrix< value_type > matrix_type
The matrix type.
Definition: jade.neoscan.hpp:32