Skip to content
Snippets Groups Projects
BCFReader.java 8.23 KiB
Newer Older
  • Learn to ignore specific revisions
  • tjc's avatar
    tjc committed
    /*
     * created: 2010
     *
     * This file is part of Artemis
     *
     * Copyright(C) 2010  Genome Research Limited
     *
     * This program is free software; you can redistribute it and/or
     * modify it under the terms of the GNU General Public License
     * as published by the Free Software Foundation; either version 2
     * of the License, or(at your option) any later version.
     *
     * This program is distributed in the hope that it will be useful,
     * but WITHOUT ANY WARRANTY; without even the implied warranty of
     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     * GNU General Public License for more details.
     *
     * You should have received a copy of the GNU General Public License
     * along with this program; if not, write to the Free Software
     * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
     *
     */
    
    
    tjc's avatar
    tjc committed
    package uk.ac.sanger.artemis.components.variant;
    
    import java.io.File;
    import java.io.FileInputStream;
    import java.io.IOException;
    import java.io.InputStream;
    import java.nio.ByteBuffer;
    import java.nio.ByteOrder;
    import java.util.List;
    import java.util.Vector;
    import java.util.regex.Pattern;
    
    import net.sf.samtools.util.BlockCompressedInputStream;
    
    
    class BCFReader
    {
      public static final int TAD_LIDX_SHIFT = 13; // linear index shift
    
    tjc's avatar
    tjc committed
      private static Pattern formatPattern = Pattern.compile("[^0-9]+");
      private BlockCompressedInputStream is;
    
    tjc's avatar
    tjc committed
      
    
    tjc's avatar
    tjc committed
      public void query(File bcf, long offset) throws IOException
    
    tjc's avatar
    tjc committed
      {
    
    tjc's avatar
    tjc committed
        is = new BlockCompressedInputStream(bcf);
    
    tjc's avatar
    tjc committed
        is.seek(offset);
    
    tjc's avatar
    tjc committed
      }
      
      
      public VCFRecord next(int bid, int beg, int end) throws IOException
      {
        try
    
    tjc's avatar
    tjc committed
        {
    
    tjc's avatar
    tjc committed
          VCFRecord bcfRecord = readVCFRecord();
          if(bcfRecord.pos >= beg && bcfRecord.pos <= end)
            return bcfRecord;
          else if(bcfRecord.pos < beg)
          {
            while( (bcfRecord = readVCFRecord()).pos <= beg )
            {
              if(bcfRecord.pos >= beg && bcfRecord.pos <= end)
                return bcfRecord;
            }
          }
        } 
        catch(Exception e)
        {
          if(is.read() != -1)  // eof
            e.printStackTrace();
        }
        
        return null;
      }
      
      private VCFRecord readVCFRecord() throws IOException
      {
        VCFRecord bcfRecord = new VCFRecord();
        bcfRecord.seqID = readInt(is);    
        bcfRecord.pos = readInt(is)+1;
        bcfRecord.quality = readFloat(is);
        
        int slen = readInt(is);
        byte[] str = new byte[slen];
        is.read(str);
    
    tjc's avatar
    tjc committed
    
    
    tjc's avatar
    tjc committed
        String parts[] = getParts(str);
        
        bcfRecord.ref = parts[0];
        bcfRecord.alt = parts[1];
        String fmt = parts[parts.length-1];
    
        if(formatPattern.matcher(fmt).matches())
        {
          bcfRecord.info = parts[parts.length-2];
          bcfRecord.format = parts[parts.length-1];
    
    tjc's avatar
    tjc committed
          
    
    tjc's avatar
    tjc committed
          int nc  = 3;
          if(bcfRecord.alt.equals("."))
            nc = 1;
    
    tjc's avatar
    tjc committed
    
    
    tjc's avatar
    tjc committed
          String fmts[] = bcfRecord.format.split(":");
          for(int j=0; j<fmts.length; j++)
    
    tjc's avatar
    tjc committed
          {
    
    tjc's avatar
    tjc committed
            int nb = getByteSize(fmts[j],1,nc);
            str = new byte[nb];
            is.read(str);
    
    tjc's avatar
    tjc committed
            
    
    tjc's avatar
    tjc committed
            final String value;
            if(fmts[j].equals("GT"))
              value = getGTString(str[0]);
            else if(fmts[j].equals("PL"))
              value = getPLString(str, nc);
            else if(fmts[j].equals("DP")||fmts[j].equals("SP")||fmts[j].equals("GQ"))
              value = Integer.toString(byteToInt(str[0]));
            else
              value = "";
            bcfRecord.data.put(fmts[j], value);
    
    tjc's avatar
    tjc committed
          }
    
    tjc's avatar
    tjc committed
          
    
    tjc's avatar
    tjc committed
        }
    
    tjc's avatar
    tjc committed
        else
          bcfRecord.info = parts[parts.length-1];
        
        return bcfRecord;
    
    tjc's avatar
    tjc committed
      }
      
      /**
       * Make a string from the byte array. Expanding NULL padding.
       * @param b
       * @return
       */
      private String[] getParts(byte[] b)
      {
        StringBuffer buff = new StringBuffer();
        for(int i=0; i<b.length; i++)
        {
          if(i == 0 && b[i] == 0)
            continue;
    
          if(b[i] == 0)
          {
            if(i > 0 && b[i-1] == 0 && b[i+1] == 0)
              buff.append(".");
            else
              buff.append(" ");
          }
          else
            buff.append((char)b[i]);
        }
        
        return buff.toString().split(" ");
      }
      
      /**
       * DP uint16 t[n] Read depth
       * GL float[n*x] Log10 likelihood of data; x = m(m+1)/2 , m = #{alleles}
       * GT uint8 t[n] phase<<6 | allele1<<3 | allele2
       * GQ uint8 t[n] Genotype quality
       * HQ uint8 t[n*2] Haplotype quality
       * PL uint8 t[n*x] Phred-scaled likelihood of data
       * misc int32 t+char* NULL padded concatenated strings (integer equal to the length)
       * @param tag
       * @param nsamples
       * @param nc
       * @return
       */
      private static int getByteSize(String tag, int nsamples, int nc)
      {
        if(tag.equals("DP"))        // Read depth
          return 2*nsamples;        // uint16_t[n]
        else if(tag.equals("GL"))   // Log10 likelihood
          return 4*nsamples*nc;     // float[nsamples*x]
        else if(tag.equals("GT"))   // phase<<6 | allele1<<3 | allele2
          return nsamples;          // uint8_t[n]
        else if(tag.equals("GQ"))   // Genotype quality
          return nsamples;          // uint8_t[n]
        else if(tag.equals("HQ"))   // Haplotype quality
          return 2*nsamples;        // uint8_t[n*2]
        else if(tag.equals("PL"))   // Phred-scaled likelihood
          return nsamples*nc;       // uint8_t[n*x]
        else if(tag.equals("SP"))   // 
          return nsamples;          // uint8_t[n]
        else                        // misc
          return 4*nsamples;        // uint32_t+char*
      }
      
      
      private String getPLString(byte[] b, int nc)
      {
        StringBuffer buff = new StringBuffer();
        for(int i=0;i<b.length; i++)
        {
          buff.append(byteToInt(b[i]));
          if(i<b.length-1)
            buff.append(",");
        }
        return buff.toString();
      }
      
      /**
       * GT genotype, allele values separated by / or |, i.e.
       * unphased or phased.
       * @param b
       * @return
       */
      private String getGTString(byte b)
      {
        return ((b >> 3) + ( (b >> 6 == 1) ? "|" : "/") + byteToInt(b));
      }
     
      private int byteToInt(byte b)
      {
        return (int)(b & 0xFF);
      }
      
    
    tjc's avatar
    tjc committed
      protected static List<BCFIndex> loadIndex(File bcfIndex) throws IOException
    
    tjc's avatar
    tjc committed
      {
        FileInputStream fis = new FileInputStream(bcfIndex);
        BlockCompressedInputStream is = new BlockCompressedInputStream(fis);
        byte[] magic = new byte[4];
        is.read(magic);
        
    
    tjc's avatar
    tjc committed
        if(!new String(magic).equals("BCI\4"))
          System.err.println("Not a BCF index file:: "+new String(magic));
    
    tjc's avatar
    tjc committed
        
    
    tjc's avatar
    tjc committed
        int n = readInt(is);
        List<BCFIndex> idx = new Vector<BCFIndex>(n);
    
    tjc's avatar
    tjc committed
        
        for(int i=0; i<n; i++)
        {
    
    tjc's avatar
    tjc committed
          BCFIndex idx2 = new BCFIndex();
    
    tjc's avatar
    tjc committed
          idx2.n = readInt(is);
          idx2.index2_offset = new long[idx2.n];
          
          for(int j=0; j<idx2.n; j++)
            idx2.index2_offset[j] = readLong(is);
    
    tjc's avatar
    tjc committed
    
    
    tjc's avatar
    tjc committed
          idx.add(idx2);
        }
        return idx;
      }
      
    
    tjc's avatar
    tjc committed
      protected long queryIndex(List<BCFIndex> idx, int tid, int beg)
    
    tjc's avatar
    tjc committed
      {
        long min_off = -1;
        if (beg < 0) 
          beg = 0;
       
        long offset[] = idx.get(tid).index2_offset;
        int i;
    
        for(i = beg>>TAD_LIDX_SHIFT; i < idx.get(tid).n && offset[i] == 0; ++i);
        min_off = (i == idx.get(tid).n)? offset[idx.get(tid).n-1] : offset[i];
           
        return min_off;
      }
      
    
    tjc's avatar
    tjc committed
      protected static int readInt(final InputStream is) throws IOException {
    
    tjc's avatar
    tjc committed
        byte[] buf = new byte[4];
        is.read(buf);
        return ByteBuffer.wrap(buf).order(ByteOrder.LITTLE_ENDIAN).getInt();
      }
      
      
    
    tjc's avatar
    tjc committed
      protected static float readFloat(final InputStream is) throws IOException {
    
    tjc's avatar
    tjc committed
        byte[] buf = new byte[4];
        is.read(buf);
        return ByteBuffer.wrap(buf).order(ByteOrder.LITTLE_ENDIAN).getFloat();
      }
    
    
    tjc's avatar
    tjc committed
      protected static long readLong(final InputStream is) throws IOException {
    
    tjc's avatar
    tjc committed
        byte[] buf = new byte[8];
        is.read(buf);
        return ByteBuffer.wrap(buf).order(ByteOrder.LITTLE_ENDIAN).getLong();
      }
    
      public static void main(String args[])
      {
        try
        {
    
    tjc's avatar
    tjc committed
          List<BCFIndex> idx = loadIndex(new File(args[0]));
    
    tjc's avatar
    tjc committed
          
    
    tjc's avatar
    tjc committed
          int sbeg = 0;
          int send = Integer.MAX_VALUE;
          if(args.length > 2)
    
    tjc's avatar
    tjc committed
          {
            sbeg = Integer.parseInt(args[2]);
            send = Integer.parseInt(args[3]);
          }
          
          BCFReader reader = new BCFReader();
    
    tjc's avatar
    tjc committed
          int bid = 0;
          long off = reader.queryIndex(idx, bid, sbeg);
          reader.query(new File(args[1]), off);
          
          VCFRecord bcfRecord;
          while( (bcfRecord = reader.next(bid, sbeg, send)) != null )
            System.out.println(bcfRecord.toString());
          
    
    tjc's avatar
    tjc committed
        }
        catch (IOException e)
        {
          // TODO Auto-generated catch block
          e.printStackTrace();
        }
      }
    }
    
    
    
    tjc's avatar
    tjc committed
    class BCFIndex
    
    tjc's avatar
    tjc committed
    {
      int n;
      long index2_offset[];
    }