Empirical
sequence_utils.h
Go to the documentation of this file.
1 
16 #ifndef EMP_SEQUENCE_UTILS_H
17 #define EMP_SEQUENCE_UTILS_H
18 
19 #include "../base/vector.h"
20 
21 #include "functions.h"
22 
23 namespace emp {
24 
25  // --- Distance functions for any array-type objects ---
26 
31  template <typename TYPE>
32  size_t calc_hamming_distance(const TYPE & in1, const TYPE & in2, int offset=0) {
33  if (offset < 0) return calc_hamming_distance(in2, in1, -offset);
34 
35  const auto size1 = in1.size();
36  const auto size2 = in2.size();
37 
38  // Calculate by how much the strings overlap.
39  size_t overlap = std::min( size1 - offset, size2 );
40 
41  // Initialize the distance to that part of the strings which do not overlap.
42  size_t num_diffs = size1 + size2 - 2 * overlap;
43 
44  // Step through the overlapped section and add on additional differences.
45  for (size_t i = 0; i < overlap; i++) {
46  if (in1[i + offset] != in2[i]) num_diffs++;
47  }
48 
49  return num_diffs;
50  }
51 
54  template <typename TYPE>
55  size_t calc_edit_distance(const TYPE & in1, const TYPE & in2) {
56  const auto size1 = in1.size();
57  const auto size2 = in2.size();
58 
59  // If either size is zero, other size indicates number of insertions needed to produce it.
60  if (size1 == 0) return size2;
61  if (size2 == 0) return size1;
62 
63  emp::vector<size_t> cur_row(size1); // The row we are calculating
64  emp::vector<size_t> prev_row(size1); // The previous row we calculated
65 
66  // Initialize the previous row to record the differece from nothing.
67  for (size_t i = 0; i < size1; i++) prev_row[i] = i + 1;
68 
69  // Loop through all other rows
70  for (size_t row = 0; row < size2; row++) {
71  // Initialize the first entry in the current row.
72  if (in1[0] == in2[row]) cur_row[0] = row;
73  else cur_row[0] = std::min(row, prev_row[0]) + 1;
74 
75  // Move through the cur_row and fill it in.
76  for (size_t col = 1; col < size1; col++) {
77  // If the values are equal, keep the value in the upper left.
78  if (in1[col] == in2[row]) cur_row[col] = prev_row[col-1];
79 
80  // Otherwise, set the current position the the minimal of the three
81  // numbers to the upper right in the chart plus one.
82  else {
83  cur_row[col] = emp::Min(prev_row[col], prev_row[col-1], cur_row[col-1]) + 1;
84  }
85  }
86 
87  // Swap cur_row and prev_row (keep cur vals in prev row, recycle vector cur_row)
88  std::swap(cur_row, prev_row);
89  }
90 
91  // Now that we are done, return the bottom-right corner of the chart.
92  return prev_row[size1 - 1];
93  }
94 
97  template <typename TYPE, typename GAP_TYPE>
98  size_t align(TYPE & in1, TYPE & in2, GAP_TYPE gap) {
99  const auto size1 = in1.size();
100  const auto size2 = in2.size();
101 
102  // If either size is zero, other size indicates number of insertions needed to produce it.
103  if (size1 == 0) return size2;
104  if (size2 == 0) return size1;
105 
106  emp::vector<size_t> cur_row(size1); // The row we are calculating
107  emp::vector<size_t> prev_row(size1); // The previous row we calculated
108  emp::vector<emp::vector<char> > edit_info(size2, emp::vector<char>(size1));
109 
110  // Initialize the previous row to record the differece from nothing.
111  for (size_t i = 0; i < size1; i++) {
112  prev_row[i] = i + 1;
113  edit_info[0][i] = 'i';
114  }
115 
116  // Loop through all other rows
117  for (size_t row = 0; row < size2; row++) {
118  // Initialize the first entry in the current row.
119  if (in1[0] == in2[row]) { cur_row[0] = row; edit_info[row][0] = 's'; }
120  else { cur_row[0] = prev_row[0] + 1; edit_info[row][0] = 'd'; }
121 
122  // Move through the cur_row and fill it in.
123  for (size_t col = 1; col < size1; col++) {
124  // If the values are equal, keep the value in the upper left.
125  if (in1[col] == in2[row]) { cur_row[col] = prev_row[col-1]; edit_info[row][col] = 's'; }
126 
127  // Otherwise, set the current position to the minimum of the three
128  // numbers to the left, upper, or upper left in the chart plus one.
129  else {
130  cur_row[col] = emp::Min(prev_row[col], prev_row[col-1], cur_row[col-1]) + 1;
131  if (cur_row[col] == prev_row[col]+1) { edit_info[row][col] = 'd'; }
132  if (cur_row[col] == prev_row[col-1]+1) { edit_info[row][col] = 's'; }
133  if (cur_row[col] == cur_row[col-1]+1) { edit_info[row][col] = 'i'; }
134  }
135  }
136 
137  // Swap cur_row and prev_row (keep cur vals in prev row, recycle vector cur_row)
138  std::swap(cur_row, prev_row);
139  }
140 
141  // Fill in gaps in the sequences to make them align!
142 
143  int c = (int) size1 - 1;
144  int r = (int) size2 - 1;
145  size_t length = 0;
146 
147  while (c >= 0 || r >= 0) {
148  if (c < 0) { ++length; --r; continue; }
149  else if (r < 0) { ++length; --c; continue; }
150 
151  char cur_move = edit_info[(size_t)r][(size_t)c];
152  switch(cur_move) {
153  case 's': --c; --r; ++length; break;
154  case 'd': --r; ++length; break;
155  case 'i': --c; ++length; break;
156  };
157  }
158 
159  c = (int) size1-1;
160  r = (int) size2-1;
161 
162  TYPE out1(length, gap);
163  TYPE out2(length, gap);
164 
165  size_t pos = length - 1;
166 
167  while (c >= 0 && r >= 0) {
168  switch(edit_info[(size_t)r][(size_t)c]) {
169  case 's': out1[pos] = in1[(size_t)c]; out2[pos] = in2[(size_t)r]; --c; --r; break;
170  case 'd': out1[pos] = gap; out2[pos] = in2[(size_t)r]; --r; break;
171  case 'i': out1[pos] = in1[(size_t)c]; out2[pos] = gap; --c; break;
172  };
173  --pos;
174  }
175  while (c >= 0) { out1[pos] = in1[(size_t)c]; --c; --pos; }
176  while (r >= 0) { out2[pos] = in2[(size_t)r]; --r; --pos; }
177 
178  in1 = out1;
179  in2 = out2;
180 
181  // Now that we are done, return the bottom-right corner of the chart.
182  return prev_row[size1 - 1];
183  }
184 
185 }
186 
187 #endif
constexpr T Min(T in1)
Min of only one element is that element itself!
Definition: math.h:50
size_t calc_edit_distance(const TYPE &in1, const TYPE &in2)
Definition: sequence_utils.h:55
size_t calc_hamming_distance(const TYPE &in1, const TYPE &in2, int offset=0)
Definition: sequence_utils.h:32
If we are in emscripten, make sure to include the header.
Definition: array.h:37
size_t align(TYPE &in1, TYPE &in2, GAP_TYPE gap)
Definition: sequence_utils.h:98