001 /*
002 * PairwiseAlignmentAlgorithm.java
003 *
004 * Copyright 2003 Sergio Anibal de Carvalho Junior
005 *
006 * This file is part of NeoBio.
007 *
008 * NeoBio is free software; you can redistribute it and/or modify it under the terms of
009 * the GNU General Public License as published by the Free Software Foundation; either
010 * version 2 of the License, or (at your option) any later version.
011 *
012 * NeoBio is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
013 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
014 * PURPOSE. See the GNU General Public License for more details.
015 *
016 * You should have received a copy of the GNU General Public License along with NeoBio;
017 * if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
018 * Boston, MA 02111-1307, USA.
019 *
020 * Proper attribution of the author as the source of the software would be appreciated.
021 *
022 * Sergio Anibal de Carvalho Junior mailto:sergioanibaljr@users.sourceforge.net
023 * Department of Computer Science http://www.dcs.kcl.ac.uk
024 * King's College London, UK http://www.kcl.ac.uk
025 *
026 * Please visit http://neobio.sourceforge.net
027 *
028 * This project was supervised by Professor Maxime Crochemore.
029 *
030 */
031
032 package neobio.alignment;
033
034 import java.io.Reader;
035 import java.io.IOException;
036
037 /**
038 * This abstract class is the superclass of all classes implementing pairwise sequence
039 * alignment algorithms. Subclasses are required to provide methods to build a high
040 * scoring alignment between two sequences and compute its score with a given scoring
041 * scheme.
042 *
043 * <P>Clients are required to set a scoring scheme and load two sequences before
044 * requesting an alignment or the computation of its score. They typically make the
045 * following sequence of method calls:</P>
046 *
047 * <CODE><BLOCKQUOTE><PRE>
048 * // prepare
049 * PairwiseAlignmentAlgorithm algorithm = new SomePairwiseAlignmentAlgorith ();
050 * algorithm.setScoringScheme (some_scoring_scheme);
051 * algorithm.loadSequences (sequence1, sequence2);
052 *
053 * // now compute the alignment
054 * PairwiseAlignment alignment = algorithm.getPairwiseAlignment();
055 * int score = algorithm.getScore();
056 * </PRE></BLOCKQUOTE></CODE>
057 *
058 * @author Sergio A. de Carvalho Jr.
059 * @see PairwiseAlignment
060 */
061 public abstract class PairwiseAlignmentAlgorithm
062 {
063 /**
064 * Tag character that signals a match in the score tag line of an alignment. Its use
065 * is conditioned by the <CODE>use_match_tag</CODE> flag.
066 *
067 * @see #use_match_tag
068 * @see #useMatchTag
069 */
070 protected static final char MATCH_TAG = '|';
071
072 /**
073 * Tag character that signals an approximate match in the score tag line of an
074 * alignment.
075 */
076 protected static final char APPROXIMATE_MATCH_TAG = '+';
077
078 /**
079 * Character that signals a mismatch in the score tag line of an alignment.
080 */
081 protected static final char MISMATCH_TAG = ' ';
082
083 /**
084 * Character that signals a gap in the score tag line of an alignment.
085 */
086 protected static final char GAP_TAG = ' ';
087
088 /**
089 * Character that signals a gap in sequence.
090 */
091 protected static final char GAP_CHARACTER = '-';
092
093 /**
094 * Indicates if the <CODE>MATCH_TAG</CODE> tag should be used or not. If it is
095 * <CODE>true</CODE>, the alignment algorithm should write the <CODE>MATCH_TAG</CODE>
096 * tag in the score tag line of the alignment whenever a match occurs between
097 * characters of the two sequences. If it is <CODE>false</CODE> the matching character
098 * should be written instead. This flag is updated whenever a scoring scheme is set to
099 * this <CODE>PairwiseAlignmentAlgorithm</CODE> by the <CODE>setScoringScheme</CODE>
100 * method.
101 *
102 * @see #MATCH_TAG
103 * @see #useMatchTag
104 * @see #setScoringScheme
105 */
106 protected boolean use_match_tag;
107
108 /**
109 * The scoring scheme used to compute a pairwise sequence alignment. It must be set
110 * before performing the alignment, and if a new scoring scheme is set, any alignment
111 * or score already computed is lost.
112 */
113 protected ScoringScheme scoring;
114
115 /**
116 * Stores the product of the last pairwise alignment performed. It contains a string
117 * representation of a highest scoring alignment between the two sequences and its
118 * score. It is set after a successful execution of the
119 * <CODE>computePairwiseAlignment</CODE> method that subclasses must implement. It is
120 * set to null if new sequences are loaded or a new scoring scheme is set.
121 */
122 protected PairwiseAlignment alignment;
123
124 /**
125 * This field stores just the score of the last pairwise alignment performed (if the
126 * <CODE>score_computed flag</CODE> is set to true). It is useful when just the score
127 * is needed (and not the alignment itselft). Its value is set after a successful
128 * execution of both <CODE>computePairwiseAlignment</CODE> or
129 * <CODE>computeScore</CODE> methods that subclasses must implement. If new sequences
130 * are loaded or a new scoring scheme is set, the <CODE>score_computed</CODE> flag is
131 * set to false, and this field's value becomes undefined.
132 */
133 protected int score;
134
135 /**
136 * Flags whether the score of the alignment between the last two loaded sequences has
137 * already been computed. It is set to true after a successful execution of both
138 * <CODE>computePairwiseAlignment</CODE> or <CODE>computeScore</CODE> methods that
139 * subclasses must implement. It is set to falsef if new sequences are loaded or a new
140 * scoring scheme is set.
141 */
142 protected boolean score_computed = false;
143
144 /**
145 * Flags whether sequences have been loaded. It is set to true after subclasses
146 * successfully load two sequences.
147 */
148 protected boolean sequences_loaded = false;
149
150 /**
151 * Sets the scoring scheme to be used for the next alignments. Any alignment or score
152 * already computed is lost. If the scoring scheme supports partial matches, this
153 * <CODE>PairwiseAlignmentAlgorithm</CODE> is set not to use the
154 * <CODE>MATCH_TAG</CODE> tag because in this case the score tag line be confusing.
155 * If the scoring scheme does not support partial matches, then the use of the
156 * <CODE>MATCH_TAG</CODE> tag is enabled.
157 *
158 * @param scoring Scoring scheme to be used
159 * @see #MATCH_TAG
160 * @see ScoringScheme#isPartialMatchSupported
161 */
162 public void setScoringScheme (ScoringScheme scoring)
163 {
164 if (scoring == null)
165 throw new IllegalArgumentException ("Null scoring scheme object.");
166
167 this.scoring = scoring;
168
169 // if the scoring scheme supports partial matches,
170 // disable the use of the MATCH_TAG character
171 if (scoring.isPartialMatchSupported())
172 this.use_match_tag = false;
173 else
174 this.use_match_tag = true;
175
176 // when a new scoring scheme is set,
177 // the alignment needs to be recomputed
178 this.alignment = null;
179 this.score_computed = false;
180 }
181
182 /**
183 * Tells wether the <CODE>MATCH_TAG</CODE> tag should be used or not. If it returns
184 * <CODE>true</CODE>, the alignment algorithm should write the <CODE>MATCH_TAG</CODE>
185 * tag in the score tag line of the alignment produced whenever a match occurs between
186 * characters of the two sequences. If it returns <CODE>false</CODE> the matching
187 * character should be written instead. The value returned is conditioned by the
188 * <CODE>use_match_tag</CODE> flag, which is updated whenever a scoring scheme is set
189 * to this <CODE>PairwiseAlignmentAlgorithm</CODE> by the
190 * <CODE>setScoringScheme</CODE> method.
191 *
192 * @return <CODE>true</CODE if the <CODE>MATCH_TAG</CODE> tag should be used,
193 * <CODE>false</CODE> otherwise
194 * @see #MATCH_TAG
195 * @see #use_match_tag
196 * @see #setScoringScheme
197 */
198 protected boolean useMatchTag ()
199 {
200 return use_match_tag;
201 }
202
203 /**
204 * Request subclasses to load the sequences according to their own needs. Any
205 * alignment and score already computed is lost. If no exception is raised, the loaded
206 * flag is set to true. Subclasses typically store the sequences in instances of an
207 * appropiate class and each can have its own contract, so check each algorithm to see
208 * what kind of sequences it produces. Input can come from any source provided they
209 * are encapsulated with a proper Reader. They must be ready to be read, i.e. the next
210 * read operation must return the sequence's first character.
211 *
212 * @param input1 First sequence
213 * @param input2 Second sequence
214 * @throws IOException If an I/O error occurs when reading the sequences
215 * @throws InvalidSequenceException If the sequences are not valid
216 */
217 public void loadSequences (Reader input1, Reader input2)
218 throws IOException, InvalidSequenceException
219 {
220 // when new sequences are loaded, the
221 // alignment and score needs to be recomputed
222 this.alignment = null;
223 this.score_computed = false;
224
225 // make sure that if an exception is raised
226 // the sequences_loaded flag is false
227 this.sequences_loaded = false;
228
229 // request subclasses to load sequences
230 loadSequencesInternal (input1, input2);
231
232 // if no exception is raised,
233 // set the loaded flag to true
234 this.sequences_loaded = true;
235 }
236
237 /**
238 * Frees pointer to loaded sequences and computed alignments (if any) so that their
239 * data can be garbage collected.
240 */
241 public void unloadSequences ()
242 {
243 // allow any alignment already computed
244 // to be garbage collected
245 this.alignment = null;
246 this.score_computed = false;
247
248 // request subclasses to unload sequences
249 unloadSequencesInternal ();
250
251 this.sequences_loaded = false;
252 }
253
254 /**
255 * Return the last pairwise alignment computed (if any) or request subclasses to
256 * compute one and return the result by calling the
257 * <CODE>computePairwiseAlignment</CODE> method. The sequences must already be loaded
258 * and a scoring scheme must already be set.
259 *
260 * @return a pairwise alignment between the loaded sequences
261 * @throws IncompatibleScoringSchemeException If the scoring scheme
262 * is not compatible with the loaded sequences
263 * @see #computePairwiseAlignment
264 */
265 public PairwiseAlignment getPairwiseAlignment ()
266 throws IncompatibleScoringSchemeException
267 {
268 if (!sequences_loaded)
269 throw new IllegalStateException ("Sequences have not been loaded.");
270
271 if (scoring == null)
272 throw new IllegalStateException ("Scoring scheme has not been set.");
273
274 if (this.alignment == null)
275 {
276 // make sure the scoring scheme won't be changed
277 // in the middle of the alignment computation
278 synchronized (scoring)
279 {
280 // compute the alignment if it hasn't been computed yet
281 this.alignment = computePairwiseAlignment();
282 }
283
284 // store the score as well
285 this.score = this.alignment.getScore();
286 this.score_computed = true;
287 }
288
289 return this.alignment;
290 }
291
292 /**
293 * Returns the score of the last alignment computed (if any) or request subclasses to
294 * compute one and return the result by calling the <CODE>computeScore</CODE> method.
295 * The sequences must already be loaded and a scoring scheme must already be set.
296 *
297 * @return the score of the alignment between the loaded sequences
298 * @throws IncompatibleScoringSchemeException If the scoring scheme
299 * is not compatible with the loaded sequences
300 * @see #computeScore
301 */
302 public int getScore () throws IncompatibleScoringSchemeException
303 {
304 if (!sequences_loaded)
305 throw new IllegalStateException ("Sequences have not been loaded.");
306
307 if (scoring == null)
308 throw new IllegalStateException ("Scoring scheme has not been set.");
309
310 if (!score_computed)
311 {
312 // make sure the scoring scheme won't be changed
313 // in the middle of the alignment computation
314 synchronized (scoring)
315 {
316 // compute the alignment's score if it hasn't been computed yet
317 this.score = computeScore();
318 }
319
320 this.score_computed = true;
321 }
322
323 return this.score;
324 }
325
326 /**
327 * Subclasses must implement this method to load sequences according to their own
328 * needs and throw an exception in case of any failure. If no exception is raised, the
329 * loaded flag is set to true by the public method and the sequences are believed to
330 * be loaded (so an alignment or score can be requested).
331 *
332 * @param input1 First sequence
333 * @param input2 Second sequence
334 * @throws IOException If an I/O error occurs when reading the sequences
335 * @throws InvalidSequenceException If the sequences are not valid
336 * @see #loadSequences
337 * @see CharSequence
338 * @see FactorSequence
339 */
340 protected abstract void loadSequencesInternal (Reader input1, Reader input2)
341 throws IOException, InvalidSequenceException;
342
343 /**
344 * Subclasses must implement this method to unload sequences according to their own
345 * storage, freeing pointers to sequences and any intermediate data so that they can
346 * be garbage collected. This methid is called by the public
347 * <CODE>unloadSequences</CODE> method.
348 *
349 * @see #unloadSequences
350 */
351 protected abstract void unloadSequencesInternal ();
352
353 /**
354 * Subclasses must implement this method to compute an alignment between the loaded
355 * sequences using the scoring scheme previously set. This methid is called by the
356 * <CODE>getPairwiseAlignment</CODE> method when needed.
357 *
358 * @return a pairwise alignment between the loaded sequences
359 * @throws IncompatibleScoringSchemeException If the scoring scheme
360 * is not compatible with the loaded sequences
361 * @see #getPairwiseAlignment
362 */
363 protected abstract PairwiseAlignment computePairwiseAlignment ()
364 throws IncompatibleScoringSchemeException;
365
366 /**
367 * Subclasses must implement this method to compute the score of the alignment between
368 * the loaded sequences using the scoring scheme previously set. This methid is called
369 * by the <CODE>getScore</CODE> method when needed.
370 *
371 * @return the score of the alignment between the loaded sequences
372 * @throws IncompatibleScoringSchemeException If the scoring scheme
373 * is not compatible with the loaded sequences
374 * @see #getScore
375 */
376 protected abstract int computeScore () throws IncompatibleScoringSchemeException;
377
378 /**
379 * Helper method to invoke the <CODE>scoreSubstitution</CODE> method of the scoring
380 * scheme set to this algorithm.
381 *
382 * @param a first character
383 * @param b second character
384 * @return score of substitution of <CODE>a</CODE> for <CODE>b</CODE>
385 * @throws IncompatibleScoringSchemeException if the scoring scheme is not compatible
386 * with the sequences being aligned
387 * @see ScoringScheme#scoreSubstitution
388 */
389 protected final int scoreSubstitution (char a, char b)
390 throws IncompatibleScoringSchemeException
391 {
392 return scoring.scoreSubstitution (a, b);
393 }
394
395 /**
396 * Helper method to invoke the <CODE>scoreInsertion</CODE> method of the scoring
397 * scheme set to this algorithm.
398 *
399 * @param a the character to be inserted
400 * @return score of insertion of <CODE>a</CODE>
401 * @throws IncompatibleScoringSchemeException if the scoring scheme is not compatible
402 * with the sequences being aligned
403 * @see ScoringScheme#scoreInsertion
404 */
405 protected final int scoreInsertion (char a) throws IncompatibleScoringSchemeException
406 {
407 return scoring.scoreInsertion (a);
408 }
409
410 /**
411 * Helper method to invoke the <CODE>scoreDeletion</CODE> method of the scoring scheme
412 * set to this algorithm.
413 *
414 * @param a the character to be deleted
415 * @return score of deletion of <CODE>a</CODE>
416 * @throws IncompatibleScoringSchemeException if the scoring scheme is not compatible
417 * with the sequences being aligned
418 * @see ScoringScheme#scoreDeletion
419 */
420 protected final int scoreDeletion (char a) throws IncompatibleScoringSchemeException
421 {
422 return scoring.scoreDeletion (a);
423 }
424
425 /**
426 * Helper method to compute the the greater of two values.
427 *
428 * @param v1 first value
429 * @param v2 second value
430 * @return the larger of <CODE>v1</CODE> and <CODE>v2</CODE>
431 */
432 protected final int max (int v1, int v2)
433 {
434 return (v1 >= v2) ? v1 : v2;
435 }
436
437 /**
438 * Helper method to compute the the greater of three values.
439 *
440 * @param v1 first value
441 * @param v2 second value
442 * @param v3 third value
443 * @return the larger of <CODE>v1</CODE>, <CODE>v2</CODE> and <CODE>v3</CODE>
444 */
445 protected final int max (int v1, int v2, int v3)
446 {
447 return (v1 >= v2) ? ((v1 >= v3)? v1 : v3) : ((v2 >= v3)? v2 : v3);
448 }
449
450 /**
451 * Helper method to compute the the greater of four values.
452 *
453 * @param v1 first value
454 * @param v2 second value
455 * @param v3 third value
456 * @param v4 fourth value
457 * @return the larger of <CODE>v1</CODE>, <CODE>v2</CODE> <CODE>v3</CODE> and
458 * <CODE>v4</CODE>
459 */
460 protected final int max (int v1, int v2, int v3, int v4)
461 {
462 int m1 = ((v1 >= v2) ? v1 : v2);
463 int m2 = ((v3 >= v4) ? v3 : v4);
464
465 return (m1 >= m2) ? m1 : m2;
466 }
467 }