/* UserDataAlgorithm.java
 *
 * created: Wed May 10 2000
 *
 * This file is part of Artemis
 *
 * Copyright (C) 2000  Genome Research Limited
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 *
 * $Header: //tmp/pathsoft/artemis/uk/ac/sanger/artemis/plot/UserDataAlgorithm.java,v 1.8 2009-06-24 14:42:33 tjc Exp $
 */

package uk.ac.sanger.artemis.plot;

import uk.ac.sanger.artemis.sequence.*;

import uk.ac.sanger.artemis.util.*;
import uk.ac.sanger.artemis.io.ReadFormatException;

import java.io.*;
import java.util.HashMap;
import java.util.regex.Pattern;

/**
 *  Objects of this class have one useful method - getValues (), which takes a
 *  range of bases and returns a single floating point number.  The number is
 *  calculated by averaging the values from a data file.  The Strand to use is
 *  set in the constructor.
 *
 *  @author Kim Rutherford <kmr@sanger.ac.uk>
 *  @version $Id: UserDataAlgorithm.java,v 1.8 2009-06-24 14:42:33 tjc Exp $
tjc's avatar
tjc committed
 **/

tjc's avatar
tjc committed
public class UserDataAlgorithm extends BaseAlgorithm
{
  /** A base per line file format */
  private static int BASE_PER_LINE_FORMAT  = 1;
  
  /** Base position is specified in the first column file format */
  private static int BASE_SPECIFIED_FORMAT = 2;
  
  /** The data read by the constructor - for BASE_PER_LINE_FORMAT */
tjc's avatar
tjc committed
  private float data[][] = null;
  
  /** The data read by the constructor - for BASE_SPECIFIED_FORMAT */
  private HashMap<Integer, float[]> dataMap;
  
  /** The maximum value in the data array. */
tjc's avatar
tjc committed
  private float data_max = Float.MIN_VALUE;
tjc's avatar
tjc committed

  /** The minimum value in the data array. */
tjc's avatar
tjc committed
  private float data_min = Float.MAX_VALUE;
  
  /** Default window size */
  private int default_window_size = 3;
tjc's avatar
tjc committed

  /** The average calculated by readData (). */
tjc's avatar
tjc committed
  private float average_value = 0;

  /** The value returned by getValueCount (). */
tjc's avatar
tjc committed
  private int number_of_values;
  
  private boolean logTransform;
  
  /** Format type for this instance */
  private int FORMAT = BASE_PER_LINE_FORMAT;
  
tjc's avatar
tjc committed
  /**
   *  Create a new UserDataAlgorithm object. This reads a file
   *  which can be one of two types of formats:
   *  a. one line of values per base.
   *  b. the first column specifies the base position with
   *     subsequent columns being values.
tjc's avatar
tjc committed
   *  @param strand The strand to do the calculation on.
   *  @param document The Document to read the data from.
   *  @param logTransform true if the log transformation is to be
   *  shown.
tjc's avatar
tjc committed
   **/
tjc's avatar
tjc committed
  public UserDataAlgorithm (final Strand strand, final Document document, 
                            final boolean logTransform)
      throws IOException 
  {
tjc's avatar
tjc committed
    super (strand, "User algorithm from " + document.getName (), "user");

tjc's avatar
tjc committed
    this.logTransform = logTransform;
tjc's avatar
tjc committed
    final Reader document_reader = document.getReader ();

tjc's avatar
tjc committed
    LinePushBackReader pushback_reader = new LinePushBackReader (document_reader);
tjc's avatar
tjc committed

    String first_line = pushback_reader.readLine ();
    if(first_line.startsWith("#"))
    {
      FORMAT = BASE_SPECIFIED_FORMAT;
      first_line = pushback_reader.readLine ().trim();
      while(first_line.equals("") || first_line.equals("#"))
        first_line = pushback_reader.readLine ().trim();
    }
    else
      FORMAT = BASE_PER_LINE_FORMAT;
    
    final Pattern patt = Pattern.compile("\\s+");
    String tokens[] = patt.split(first_line);
    
    if (tokens.length < 1) 
tjc's avatar
tjc committed
      throw new ReadFormatException ("unknown file type");

    this.number_of_values = tokens.length;
tjc's avatar
tjc committed
    pushback_reader.pushBack (first_line);
    
    if(FORMAT == BASE_PER_LINE_FORMAT)
      data = new float [strand.getSequenceLength ()][tokens.length];
    
tjc's avatar
tjc committed
    readData (pushback_reader);
tjc's avatar
tjc committed
    pushback_reader.close();
tjc's avatar
tjc committed
  }

  /**
   *  Read all from buffered_reader into data.
   **/
  private void readData (final LinePushBackReader pushback_reader)
tjc's avatar
tjc committed
      throws IOException
  {
tjc's avatar
tjc committed
    String line = null;
    int count = 0;
    int countAll = 0;
    int estimate_window_size = Integer.MAX_VALUE;
tjc's avatar
tjc committed
    final int seqLength = getStrand ().getSequenceLength ();
    final Pattern patt = Pattern.compile("\\s+");
    
tjc's avatar
tjc committed
    while ((line = pushback_reader.readLine ()) != null)
    {
tjc's avatar
tjc committed
      if (count >= seqLength) 
tjc's avatar
tjc committed
        throw new ReadFormatException ("too many values in input file");

      String tokens[] = patt.split(line); 
      if (FORMAT == BASE_PER_LINE_FORMAT && tokens.length != data[0].length)
        throw new ReadFormatException ("line has the wrong number of fields:\n"+line);
      
      int base = 0;
      
      float line_data[] = new float[tokens.length-1];
      for (int i = 0 ; i < tokens.length ; ++i)
tjc's avatar
tjc committed
      {
tjc's avatar
tjc committed
        {
          if(FORMAT == BASE_SPECIFIED_FORMAT && i == 0)
tjc's avatar
tjc committed
          {
            int last_base = base;
            base = (int) Float.parseFloat(tokens[i]);
tjc's avatar
tjc committed
            
            if(base > seqLength)
              throw new ReadFormatException (
                  "a base position is greater than the sequence length:\n"+line);
tjc's avatar
tjc committed

            if((base - last_base) < estimate_window_size &&
               (base - last_base) > 0)
              estimate_window_size = base - last_base;
            if(dataMap == null)
              dataMap = new HashMap<Integer, float[]>();
            continue;
          }
            
          float value = Float.parseFloat(tokens[i]);
          if(logTransform)
            value = (float) Math.log(value+1);
tjc's avatar
tjc committed

          if (value > data_max) 
            data_max = value;
          if (value < data_min)
            data_min = value;
            
          if(FORMAT == BASE_PER_LINE_FORMAT)
tjc's avatar
tjc committed
            data[count][i] = value;
tjc's avatar
tjc committed
          {
            line_data[i-1] = value;
            countAll++;
tjc's avatar
tjc committed
          }
          average_value += value;
        } 
        catch (NumberFormatException e) 
        {
          throw new ReadFormatException ("cannot understand this number: " +
                                         tokens[i] + " - " +e.getMessage ());
tjc's avatar
tjc committed
        }
tjc's avatar
tjc committed
      ++count;
      if(FORMAT == BASE_SPECIFIED_FORMAT)
        dataMap.put(base, line_data);
tjc's avatar
tjc committed
    }

    if (FORMAT == BASE_PER_LINE_FORMAT)
      average_value /= data[0].length * seqLength;
    else
    {
      average_value = average_value/countAll;
      if(estimate_window_size != Integer.MAX_VALUE)
        default_window_size = estimate_window_size;
    }
tjc's avatar
tjc committed
  }

  /**
   *  Return the value of the function between a pair of bases.
   *  @param start The start base (included in the range).
   *  @param end The end base (included in the range).
   *  @param values The one return value for this algorithm is returned in
   *    this array.
   **/
tjc's avatar
tjc committed
  public void getValues (int start, int end, final float [] values) 
  {
tjc's avatar
tjc committed
    final int value_count = getValueCount ();

    if(getStrand ().getDirection() == Bases.REVERSE)
    {
      int tstart = start;
      int tend   = end;
      end   = getStrand().getBases().getComplementPosition(tstart);
      start = getStrand().getBases().getComplementPosition(tend);
    }
    
    if(FORMAT == BASE_SPECIFIED_FORMAT)
tjc's avatar
tjc committed
    {
      for (int i = 0 ; i < value_count ; ++i) 
      {
        values[i] = 0;
        int count = 0;
        for (int base = start ; base <= end ; ++base) 
        {
          if(dataMap.containsKey(base))
          {
            values[i] += ((float[])dataMap.get(base))[i];
            count++;
          }
        }

        if(count > 1)
          values[i] = values[i]/count;
      }
    }
    else
    {
      for (int i = 0 ; i < value_count ; ++i) 
      {
        values [i] = 0;
        for (int base = start ; base <= end ; ++base) 
          values [i] += data[base - 1][i] / (end - start + 1);
      }
tjc's avatar
tjc committed
    }
  }

  /**
   *  Return the number of values a call to getValues () will return - one
   *  in this case.
   **/
tjc's avatar
tjc committed
  public int getValueCount () 
  {
    if(FORMAT == BASE_SPECIFIED_FORMAT)
      return number_of_values -1;
tjc's avatar
tjc committed
    return number_of_values;
  }

  /**
   *  Return the default or optimal window size.
   *  @return null is returned if this algorithm doesn't have optimal window
   *    size.
   **/
tjc's avatar
tjc committed
  public Integer getDefaultWindowSize () 
  {
    return new Integer (default_window_size);
tjc's avatar
tjc committed
  }

  /**
   *  Return the default maximum window size for this algorithm.
   *  @return null is returned if this algorithm doesn't have maximum window
   *    size.
   **/
tjc's avatar
tjc committed
  public Integer getDefaultMaxWindowSize ()
  {
tjc's avatar
tjc committed
    return new Integer (100);
  }

  /**
   *  Return the default minimum window size for this algorithm.
   *  @return null is returned if this algorithm doesn't have minimum window
   *    size.
   **/
tjc's avatar
tjc committed
  public Integer getDefaultMinWindowSize () 
  {
tjc's avatar
tjc committed
    return new Integer (1);
  }

  /**
   *  Return the default or optimal step size.
   *  @return null is returned if this algorithm doesn't have optimal step
   *    size.
   **/
tjc's avatar
tjc committed
  public Integer getDefaultStepSize (int window_size)
  {
    if (window_size > 10) 
tjc's avatar
tjc committed
      return new Integer (window_size / 10);
tjc's avatar
tjc committed
    else 
tjc's avatar
tjc committed
      return null;
  }

  /**
   *  Return the maximum value of this algorithm.
   **/
tjc's avatar
tjc committed
  protected Float getMaximumInternal ()
  {
tjc's avatar
tjc committed
    return new Float (data_max);
  }

  /**
   *  Return the minimum value of this algorithm.
   **/
tjc's avatar
tjc committed
  protected Float getMinimumInternal () 
  {
tjc's avatar
tjc committed
    return new Float (data_min);
  }

  /**
   *  Return the average value of function over the whole strand.
   **/
tjc's avatar
tjc committed
  public Float getAverage () 
  {
tjc's avatar
tjc committed
    return new Float (average_value);
  }

}