Skip to content
Snippets Groups Projects
BasePattern.java 14.7 KiB
Newer Older
  • Learn to ignore specific revisions
  • tjc's avatar
    tjc committed
    /* BasePattern.java
     *
     * created: Sun Jan 10 1999
     *
     * This file is part of Artemis
     *
     * Copyright (C) 1998,1999,2000  Genome Research Limited
     *
     * This program is free software; you can redistribute it and/or
     * modify it under the terms of the GNU General Public License
     * as published by the Free Software Foundation; either version 2
     * of the License, or (at your option) any later version.
     *
     * This program is distributed in the hope that it will be useful,
     * but WITHOUT ANY WARRANTY; without even the implied warranty of
     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     * GNU General Public License for more details.
     *
     * You should have received a copy of the GNU General Public License
     * along with this program; if not, write to the Free Software
     * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
     *
     * $Header: //tmp/pathsoft/artemis/uk/ac/sanger/artemis/sequence/BasePattern.java,v 1.1 2004-06-09 09:52:12 tjc Exp $
     */
    
    package uk.ac.sanger.artemis.sequence;
    
    import uk.ac.sanger.artemis.util.*;
    
    /**
     *  A BasePattern is a String that describes a sequence of bases.
     *
     *  @author Kim Rutherford
     *  @version $Id: BasePattern.java,v 1.1 2004-06-09 09:52:12 tjc Exp $
     **/
    
    public class BasePattern {
      /**
       *  Create a new BasePattern object.
       *  @param pattern_string The is a String representation of the pattern.
       **/
      public BasePattern (final String pattern_string)
          throws BasePatternFormatException {
        if (pattern_string.length () < 1) {
          throw new BasePatternFormatException ("pattern too short");
        }
        this.pattern_string = pattern_string.toLowerCase ();
    
        this.pattern_type = patternType (this.pattern_string);
    
        if (pattern_type == ILLEGAL_PATTERN) {
          throw new BasePatternFormatException ("illegal characters in pattern");
        }
      }
    
      /**
       *  A return value of patternType () - illegal characters in the pattern.
       **/
      private static int ILLEGAL_PATTERN = -1;
    
      /**
       *  A return value of patternType () - the pattern contains only the
       *  characters 'a', 't', 'g' and 'c'.
       **/
      private static int SIMPLE_PATTERN = 0;
    
      /**
       *  A return value of patternType () - the pattern contains only the
       *  characters a,t,g,c,r,y,k,m,s,w,n,b,d,h and v, ie IUC base codes.
       **/
      private static int IUC_PATTERN = 1;
    
      /**
       *  Return a String representation of this BasePattern.
       **/
      public String toString () {
        return pattern_string;
      }
    
      /**
       *  Returns true if and only if this pattern matches the given String.  The
       *  match must be exact so the pattern and the string must be the same
       *  length.
       **/
      public boolean matches (final String match_string) {
        if (match_string.length () == pattern_string.length () &&
            searchFor (match_string, 0) == 0) {
          return true;
        } else {
          return false;
        }
      }
    
      /**
       *  Find the next match of this pattern in either Strand of the given
       *  Bases object.  This method searches both strands simultaneously by
       *  searching the underlying Bases object directly.
       *  @param bases This holds the Strand objects to search.
       *  @param search_start_marker The match that will be returned will be
       *    after this base Marker position.  Position 1 on the reverse strand is
       *    considered to be after position 1 on the forward strand, forward
       *    position 2 is after reverse position 1, reverse position 2 is after
       *    forward position 2, etc.  This scheme allows the caller to iterate
       *    through all matches.
       *  @param search_end_position The search will not extend past this
       *    position.
       *  @param search_backwards If true the search will move from last base to
       *    first base, otherwise first to last.
       *  @return A MarkerRange covering the matching bases or null if there is no
       *    match in the given range.
       **/
      public MarkerRange findMatch (final Bases bases,
                                    final Marker search_start_marker,
                                    final int search_end_position,
                                    final boolean search_backwards) {
        final String bases_string = bases.toString ();
    
        // search the bases_string forward for the pattern_string and its
        // complement
    
        // the String index position in bases_string at which to start the search
        // for this pattern
        final int forward_search_start_index;
    
        // the String index position in bases_string at which to start the search
        // for the reverse complement of this position
        final int complement_search_start_index;
    
        if (search_backwards) {
          if (search_start_marker == null) {
            forward_search_start_index = bases.getLength () - 1;
            complement_search_start_index = bases.getLength () - 1;
          } else {
            complement_search_start_index =
              search_start_marker.getRawPosition () - 2;
            if (search_start_marker.getStrand ().isForwardStrand ()) {
              forward_search_start_index =
                search_start_marker.getRawPosition () - 2;
            } else {
              forward_search_start_index =
                search_start_marker.getRawPosition () - 1;
            }
          }
        } else {
          if (search_start_marker == null) {
            forward_search_start_index = 0;
            complement_search_start_index = 0;
          } else {
            forward_search_start_index = search_start_marker.getRawPosition ();
            if (search_start_marker.getStrand ().isForwardStrand ()) {
              complement_search_start_index =
                search_start_marker.getRawPosition () - 1;
            } else {
              complement_search_start_index =
                search_start_marker.getRawPosition ();
            }
          }
        }
        
        final int forward_search_result =
          searchFor (bases_string,
                     pattern_string,
                     forward_search_start_index,
                     search_backwards);
    
        final int complement_search_result =
          searchFor (bases_string,
                     Bases.reverseComplement (pattern_string),
                     complement_search_start_index,
                     search_backwards);
    
        final int match_first_base;
        final int match_last_base;
    
        final Strand match_strand;
    
        if (forward_search_result == -1) {
          if (complement_search_result == -1) {
            // no match
            return null;
          }
        }
    
        if (search_backwards) {
          // take the match that is closest to the end, or the complement match if
          // there is a tie
          if (complement_search_result != -1 &&
              (forward_search_result == -1 ||
               forward_search_result != -1 &&
               complement_search_result >= forward_search_result)) {
            match_first_base =
              bases.getComplementPosition (complement_search_result + 1);
            match_last_base = match_first_base - (pattern_string.length () - 1);
            match_strand = bases.getReverseStrand ();
          } else {
            match_first_base = forward_search_result + 1;
            match_last_base = match_first_base + pattern_string.length () - 1;
            match_strand = bases.getForwardStrand ();
          }
        } else {
          // take the match that is closest to base 1, or the forward match if
          // there is a tie
          if (forward_search_result != -1 &&
              (complement_search_result == -1 ||
               complement_search_result != -1 &&
               forward_search_result <= complement_search_result)) {
            match_first_base = forward_search_result + 1;
            match_last_base = match_first_base + pattern_string.length () - 1;
            match_strand = bases.getForwardStrand ();
          } else {
            match_first_base =
              bases.getComplementPosition (complement_search_result + 1);
            match_last_base = match_first_base - (pattern_string.length () - 1);
            match_strand = bases.getReverseStrand ();
          }
        }
    
        if (match_last_base > search_end_position) {
          // there is no match within the range
          return null;
        }
    
        try {
          return new MarkerRange (match_strand,
                                  match_first_base,
                                  match_last_base);
        } catch (OutOfRangeException e) {
          throw new Error ("internal error - unexpected exception: " + e);
        }
      }
    
      /**
       *  Find the all matches of this pattern in either Strand of the given
       *  Bases object.  This method searches both strands simultaneously by
       *  searching the underlying Bases object directly.
       *  @param bases This holds the Strand objects to search.
       *  @param search_start_marker The search will start at this Marker
       *    position.  See the comments on the search_start_marker argument to
       *    findMatch ().
       *  @param search_end_position The search will not extend past this base
       *    position.
       *  @return A MarkerRangeVector holding all the matches or null if there are
       *    no matches in the given range.
       **/
      public MarkerRangeVector findMatches (final Bases bases,
                                            final Marker search_start_marker,
                                            final int search_end_position) {
        final MarkerRangeVector return_vector = new MarkerRangeVector ();
    
        Marker current_position_marker = search_start_marker;
    
        while (true) {
          final MarkerRange new_match_position =
            findMatch (bases, current_position_marker, search_end_position, false);
    
          if (new_match_position == null) {
            break;
          } else {
            current_position_marker = new_match_position.getRawStart ();
            return_vector.add (new_match_position);
          }
        }
    
        return return_vector;
      }
    
      /**
       *  Search for this BasePattern in the given string.
       *  @param bases_string Search this String for the pattern.
       *  @param start_index This is the index in bases_string where the search
       *    should start.
       *  @return The index of the match or -1 if there is no match.
       **/
    
      private int searchFor (final String bases_string,
    
    tjc's avatar
    tjc committed
                            final int start_index) {
        return searchFor (bases_string, pattern_string, start_index, false);
      }
    
      /**
       *  Search for a pattern in a string.
       *  @param bases_string Search this String for the pattern.
       *  @param pattern_string This contains the pattern to search for and may
       *    include IUB base codes.
       *  @param start_index This is the index in bases_string where the search
       *    should start.
       *  @param search_backwards If true the search will move from last base to
       *    first base, otherwise first to last.
       *  @return The index of the match or -1 if there is no match.
       **/
      private int searchFor (final String bases_string,
                             final String pattern_string,
                             int start_index,
                             final boolean search_backwards) {
    
        if (search_backwards) {
          if (pattern_type == SIMPLE_PATTERN) {
            // indexOf () is much faster in this case
            return bases_string.lastIndexOf (pattern_string, start_index);
          }
    
          if (bases_string.length () - start_index < toString ().length ()) {
            start_index = bases_string.length () - toString ().length ();
          }
    
          for (int i = start_index ; i > 0 ; --i) {
            boolean match_failed = false;
            for (int pattern_index = 0 ;
                 pattern_index < pattern_string.length () ;
                 ++pattern_index) {
              if (charMatch (bases_string.charAt (i + pattern_index),
                             pattern_string.charAt (pattern_index))) {
                // OK, so continue with the inner loop
              } else {
                match_failed = true;
                break;
              }
            }
    
            if (match_failed) {
              // go around main loop again
            } else {
              // found a match
              return i;
            }
          }
        } else {
          if (pattern_type == SIMPLE_PATTERN) {
            // indexOf () is much faster in this case
            return bases_string.indexOf (pattern_string, start_index);
          }
    
          for (int i = start_index ;
               i < bases_string.length () - pattern_string.length () + 1;
               ++i) {
            boolean match_failed = false;
            for (int pattern_index = 0 ;
                 pattern_index < pattern_string.length () ;
                 ++pattern_index) {
    
    tjc's avatar
    tjc committed
              if (charMatch (bases_string.charAt (i + pattern_index),
                             pattern_string.charAt (pattern_index))) {
                // OK, so continue with the inner loop
              } else {
                match_failed = true;
                break;
              }
            }
    
            if (match_failed) {
              // go around main loop again
            } else {
              // found a match
              return i;
            }
          }
        }
    
        return -1;
      }
    
      /**
       *  Check a base to see if it matches an IUC base code.
       *  @param base_char The base character to be checked.
       *  @param pattern_char The single letter IUC base code to match the
       *    character against.
       **/
      private boolean charMatch (final char base_char,
                                 final char pattern_char) {
        switch (base_char) {
        case 'c':
          switch (pattern_char) {
          case 'c':
          case 'y': case 'm': case 's': case 'n': case 'b': case 'h': case 'v':
            return true;
          }
          break;
        case 't':
          switch (pattern_char) {
          case 't':
          case 'y': case 'k': case 'w': case 'n': case 'b': case 'd': case 'h':
            return true;
          }
          break;
        case 'a':
          switch (pattern_char) {
          case 'a':
          case 'r': case 'm': case 'w': case 'n': case 'd': case 'h': case 'v':
            return true;
          }
          break;
        case 'g':
          switch (pattern_char) {
          case 'g':
          case 'r': case 'k': case 's': case 'n': case 'b': case 'd': case 'v':
            return true;
          }
          break;
        default:
          if (pattern_char == 'n') {
            return true;
          }
        }
    
        return false;
      }
    
      /**
       *  Return the pattern type of the argument.  Returns ILLEGAL_PATTERN if
       *  there are illegal characters in the pattern.  SIMPLE_PATTERN if the
       *  pattern contains only the characters 'a', 't', 'g' and 'c'.  IUC_PATTERN
       *  if the pattern contains only the characters a,t,g,c,r,y,k,m,s,w,n,b,d,h
       *  and v, ie IUC base codes.
       **/
      private static int patternType (String pattern_string) {
        boolean seen_iuc = false;
    
        for (int i = 0 ; i < pattern_string.length () ; ++i) {
          switch (pattern_string.charAt (i)) {
          case 'r': case 'y': case 'k': case 'm': case 's':
          case 'w': case 'n': case 'b': case 'd': case 'h': case 'v':
            seen_iuc = true;
            break;
          case 'a': case 't': case 'g': case 'c':
            // no problem
            break;
          default:
            // anything else is illegal
            return ILLEGAL_PATTERN;
          }
        }
    
        if (seen_iuc) {
          return IUC_PATTERN;
        } else {
          return SIMPLE_PATTERN;
        }
      }
    
      /**
       *  The pattern that was passed to the constructor.
       **/
      final String pattern_string;
    
      /**
       *  The type of this pattern, SIMPLE_PATTERN, IUC_PATTERN etc.
       **/
      final int pattern_type;
    }