Пример #1
0
  /**
   * Read all VCF entries stored for the node at index 'idx'.
   *
   * <p>Data that was read before is returned directly: either the single cached leaf node or the
   * per-node 'intersect' cache. Otherwise every file region registered for the node is read from
   * the VCF file, keeping each entry only once.
   *
   * @param idx node index
   * @return entries belonging to the node, in the order they were read
   * @throws RuntimeException wrapping any IOException raised while reading the file
   */
  List<VcfEntry> readEntries(int idx) {
    // Already available in one of the caches?
    if (cachedLeafNodeIdx == idx) return cachedLeafNode;
    List<VcfEntry> entries = intersect[idx];
    if (entries != null) return entries;

    try {
      // A node may map to several non-contiguous file regions
      int numRegions = intersectFilePosStart[idx].length;

      entries = new ArrayList<VcfEntry>();
      Set<VcfEntry> seen = new HashSet<>();

      for (int region = 0; region < numRegions; region++) {
        long regionStart = intersectFilePosStart[idx][region];
        long regionEnd = intersectFilePosEnd[idx][region];

        if (debug) {
          Gpr.debug(
              "\tintersect[" + idx + "][" + region + "]:\t[" + regionStart + " , " + regionEnd
                  + " ]");
        }

        // Jump to the beginning of the region and read until its end
        vcf.seek(regionStart);
        for (VcfEntry entry : vcf) {
          boolean isNew = seen.add(entry); // Each entry is collected only once
          if (isNew) {
            entries.add(entry);
            if (debug) {
              Gpr.debug("\tParsing VcfEntry [" + vcf.getFilePointer() + "]: " + entry);
            }
          }

          // Reached the end of this region?
          if (vcf.getFilePointer() >= regionEnd) break;
        }
      }

      // Remember what we have just read
      if (isLeaf(idx)) {
        // Only one leaf node is cached at a time
        cachedLeafNodeIdx = idx;
        cachedLeafNode = entries;
      } else if (intersect[idx] == null) {
        // Non-leaf nodes have very few intersect entries, so they are all cached
        intersect[idx] = entries;
      }

      return entries;
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }
Пример #2
0
  /**
   * Decide whether this variant is a duplication.
   *
   * <p>Reference: http://www.hgvs.org/mutnomen/disc.html#dupins ...the description "dup" (see
   * Standards) may by definition only be used when the additional copy is directly 3'-flanking of
   * the original copy (tandem duplication)
   *
   * <p>Two checks are performed: first whether AA.Alt is exactly AA.Ref repeated twice, then
   * whether the net amino acid change equals the protein sequence that ends at the variant's
   * codon number.
   *
   * @return true if either check identifies a duplication; false otherwise, including when the
   *     protein sequence is not available
   */
  protected boolean isDuplication() {
    // ---
    // Simple case: AA.Alt is AA.Ref written twice
    // ---
    String refAa = variantEffect.getAaRef().toUpperCase();
    String altAa = variantEffect.getAaAlt().toUpperCase();
    String refAaTwice = refAa + refAa;

    if (debug) {
      Gpr.debug("AA.Ref: '" + refAa + "'\tAA.Alt: '" + altAa);
    }
    if (altAa.equals(refAaTwice)) {
      return true;
    }

    // ---
    // General case: compare against the protein sequence
    // ---
    String protein = tr.protein();
    if (protein == null) {
      return false; // Cannot calculate duplication
    }

    // Net amino acid change introduced by the variant
    String netChange = variantEffect.getAaNetChange();

    // Stretch of the protein, as long as the net change, ending at the codon number
    int aaEnd = variantEffect.getCodonNum();
    int aaStart = aaEnd - netChange.length();
    boolean insideProtein = (aaStart >= 0) && (aaEnd <= protein.length());
    if (!insideProtein) {
      return false;
    }
    String previousAa = protein.substring(aaStart, aaEnd);

    // A duplication repeats exactly that stretch
    boolean dup = previousAa.equalsIgnoreCase(netChange);
    if (debug) {
      Gpr.debug(
          "SEQUENCE [ " + aaStart + " , " + aaEnd + " ]: '" + previousAa + "'"
              + "\n\tAA Ref       : '" + variantEffect.getAaRef() + "'"
              + "\n\tAA Alt       : '" + variantEffect.getAaAlt() + "'"
              + "\n\tAA Alt (net) : '" + netChange + "'"
              + "\n\tDup?         : " + dup);
    }

    return dup;
  }
Пример #3
0
  /**
   * Annotate a VCF entry using the matching database entries.
   *
   * @param vcfEntry entry to annotate (modified in place)
   * @return true if IDs or INFO fields were added to 'vcfEntry'
   * @throws IOException declared by the signature
   */
  public boolean annotate(VcfEntry vcfEntry) throws IOException {
    Set<String> idSet = new HashSet<>();
    Map<String, String> infos = new HashMap<>();

    // ---
    // Collect the database entries matching each variant
    // Note that QueryResult.variantVcfEntry can be 'null'
    // ---
    List<QueryResult> queryResults = new LinkedList<>();
    Set<VcfEntry> uniqueVcfEntries = new HashSet<>();
    for (Variant variant : vcfEntry.variants()) {
      // Huge structural variants are not queried
      if (var_isHugeSkip(variant)) {
        continue;
      }

      Collection<VariantVcfEntry> dbResults = query(variant);

      // Keep track of every database VcfEntry we have found
      for (VariantVcfEntry dbEntry : dbResults) {
        uniqueVcfEntries.add(dbEntry.getVcfEntry());
      }

      // Pair the query with its results
      QueryResult queryResult = new QueryResult(variant, dbResults);
      queryResults.add(queryResult);
      if (debug) {
        Gpr.debug("Adding QueryResult: " + queryResult);
      }
    }

    // Look for INFO fields that we might not have seen before
    if (useAllInfoFields) {
      for (VcfEntry dbVcfEntry : uniqueVcfEntries) {
        discoverInfoFields(dbVcfEntry);
      }
    }

    // INFO fields based on 'REF' data
    findDbInfoRef(infos, uniqueVcfEntries);

    // ---
    // Gather IDs, 'exists' flag and INFO values from every result
    // ---
    boolean exists = false;
    for (QueryResult queryResult : queryResults) {
      if (debug) {
        Gpr.debug("Processing QueryResult: " + queryResult);
      }

      if (useId) {
        findDbId(idSet, queryResult);
      }
      if (existsInfoField != null && findDbExists(queryResult)) {
        exists = true;
      }
      if (useInfoFields) {
        findDbInfo(infos, queryResult);
      }
    }

    // ---
    // Write the collected data into the input entry
    // ---
    boolean idsAdded = annotateIds(vcfEntry, idSet);
    boolean infoAdded = annotateInfo(vcfEntry, infos);
    if (exists) {
      annotateExists(vcfEntry);
    }

    return idsAdded || infoAdded;
  }

  /** True for variants that 'annotate' does not query: huge structural variants */
  private static boolean var_isHugeSkip(Variant variant) {
    return variant.isStructuralHuge();
  }
Пример #4
0
 /**
  * Append a value to the comma-separated list stored in 'info' under 'infoFieldName'.
  *
  * <p>A null 'newValue' is ignored unless 'annotateEmpty' is set, in which case it is stored as
  * "."
  */
 void findDbInfoAddValue(Map<String, String> info, String infoFieldName, String newValue) {
   boolean missing = (newValue == null);
   if (missing && !annotateEmpty) {
     return;
   }
   if (debug) {
     Gpr.debug("\tINFO:" + infoFieldName + "\tnewValue: " + newValue);
   }

   String addition = missing ? "." : newValue;
   String previous = info.get(infoFieldName);

   String combined;
   if (previous == null) {
     combined = "" + addition;
   } else {
     combined = previous + "," + addition;
   }
   info.put(infoFieldName, combined);
 }
Пример #5
0
  /**
   * Variant after exon end: a SNP at position 410 is checked against the transcript's own CDS and
   * protein sequences
   */
  @Test
  public void test_apply_variant_09() {
    Gpr.debug("Test");

    Variant snpAfterExon = new Variant(transcript.getParent(), 410, "A", "T");
    checkApplySnp(
        snpAfterExon, //
        transcript.cds(), //
        transcript.protein(), //
        1, //
        300, //
        399);
  }
Пример #6
0
 /** Rare amino acid: compare effects for 'tests/rareAa.txt' on genome 'testHg3765Chr22' */
 @Test
 public void test_30_RareAa() {
   Gpr.debug("Test");
   CompareEffects compareEffects = new CompareEffects("testHg3765Chr22", randSeed, verbose);
   compareEffects.snpEffect("tests/rareAa.txt", null, true);
 }
Пример #7
0
  /**
   * Collect all non-empty values of INFO field 'infoFieldName' from the results in 'qr'.
   *
   * @return the values joined by commas, or null if no result has a non-empty value
   */
  protected String findDbInfo(String infoFieldName, QueryResult qr) {
    if (debug) {
      Gpr.debug("Finding DB data for INFO field: " + infoFieldName);
    }

    StringBuilder joined = new StringBuilder();
    for (VariantVcfEntry dbEntry : qr.results) {
      if (dbEntry == null) {
        continue;
      }

      String value = dbEntry.getVcfEntry().getInfo(infoFieldName);
      if (VcfEntry.isEmpty(value)) {
        continue;
      }

      if (debug) {
        Gpr.debug("\tFound: " + value);
      }
      if (joined.length() > 0) {
        joined.append(',');
      }
      joined.append(value);
    }

    return joined.length() > 0 ? joined.toString() : null;
  }
Пример #8
0
  /**
   * Parse the tab-separated 'line' of this entry, using 'vcfFileIterator' to read the fields.
   *
   * <p>Only the leading columns are split; whatever follows the FORMAT column is kept as a single
   * raw string (genotypes are parsed lazily).
   *
   * @throws RuntimeException if the line has fewer than four tab-separated fields
   */
  public void parse() {
    // Split into at most 10 pieces: the first 9 fields plus the unparsed genotypes
    String[] columns = line.split("\t", 10);

    // Is line OK?
    if (columns.length < 4) {
      throw new RuntimeException(
          "Impropper VCF entry: Not enough fields (missing tab separators?).\n" + line);
    }

    // Chromosome
    chromosomeName = columns[0].trim();
    Chromosome chromo = vcfFileIterator.getChromosome(chromosomeName);
    parent = chromo;
    vcfFileIterator.sanityCheckChromo(chromosomeName, chromo); // Sanity check

    // Start position
    start = vcfFileIterator.parsePosition(vcfFileIterator.readField(columns, 1));

    // ID (e.g. might indicate dbSnp)
    id = vcfFileIterator.readField(columns, 2);

    // REF
    ref = vcfFileIterator.readField(columns, 3).toUpperCase();
    strandMinus = false; // Strand is always positive (defined in VCF spec.)

    // ALT
    altStr = vcfFileIterator.readField(columns, 4).toUpperCase();
    parseAlts(altStr);

    // Quality
    String qualityStr = vcfFileIterator.readField(columns, 5);
    if (qualityStr.isEmpty()) {
      quality = null;
    } else {
      quality = Gpr.parseDoubleSafe(qualityStr);
    }

    // Filter
    filter = vcfFileIterator.readField(columns, 6);

    // INFO fields: only the raw string is stored here
    infoStr = vcfFileIterator.readField(columns, 7);
    info = null;

    // Start & End coordinates are anchored to the reference genome, thus based on REF field (ALT
    // is not taken into account)
    parseEnd(altStr);

    // Genotype format: this field is optional, so it can be null or EMPTY ('.')
    format = null;
    if (columns.length > 8) {
      format = vcfFileIterator.readField(columns, 8);
    }

    // Genotype fields (lazy parse)
    if (columns.length > 9) {
      genotypeFieldsStr = columns[9];
    }
  }
Пример #9
0
  /**
   * Query the database and keep only the results that match 'variant'.
   *
   * @return the matching database entries, in the order the database returned them
   */
  protected Collection<VariantVcfEntry> query(Variant variant) {
    Collection<VariantVcfEntry> candidates = dbVcf.query(variant);

    // Keep only the candidates accepted by 'match'
    List<VariantVcfEntry> matches = new LinkedList<>();
    for (VariantVcfEntry candidate : candidates) {
      boolean isMatch = match(variant, candidate);

      if (debug) {
        String outcome = isMatch ? "dbEntry matches query" : "dbEntry does NOT match query";
        Gpr.debug(outcome + "\tvariant: " + variant + "\tdbEntry: " + candidate);
      }

      if (isMatch) {
        matches.add(candidate);
      }
    }

    if (debug) {
      Gpr.debug("Match query results: " + matches.size());
    }
    return matches;
  }
Пример #10
0
  /**
   * Parse a genotype string (sparse matrix): a comma-separated list of indexes into 'gt'. Every
   * listed position is set to 'valueInt' (cast to byte).
   *
   * <p>Nothing is done when 'str' is null, empty or equal to "true".
   */
  void parseSparseGt(String str, byte gt[], int valueInt) {
    boolean nothingToParse = (str == null) || str.isEmpty() || str.equals("true");
    if (nothingToParse) {
      return;
    }

    byte value = (byte) valueInt;

    // Each comma-separated token is an index to set
    for (String token : str.split(",")) {
      int position = Gpr.parseIntSafe(token);
      gt[position] = value;
    }
  }
Пример #11
0
  /**
   * Add to 'results' the VCF entries of node 'idx' that intersect 'queryMarker'.
   *
   * <p>An entry is added (once) if any of its variants intersects the query. The scan stops at
   * the first entry that starts after the query's end.
   */
  protected void queryIntersects(Interval queryMarker, int idx, Markers results) {
    if (intersectFilePosStart[idx] == null) {
      return;
    }
    if (debug) {
      Gpr.debug("queryIntersects\tidx: " + idx);
    }

    // Entries are read from disk (or from cache)
    for (VcfEntry entry : readEntries(idx)) {
      // Does any variant within this entry intersect the query marker?
      boolean intersects = false;
      for (Variant variant : entry.variants()) {
        if (variant.intersects(queryMarker)) {
          intersects = true;
          break;
        }
      }

      if (intersects) {
        if (debug) {
          Gpr.debug("\tAdding matching result: " + entry);
        }
        results.add(entry); // Stored only once per entry
      }

      // Past the query's end coordinate? No need to look any further
      if (queryMarker.getEnd() < entry.getStart()) {
        return;
      }
    }
  }
Пример #12
0
 /** Multi-line description of the annotation settings followed by the database ('dbVcf') */
 @Override
 public String toString() {
   StringBuilder sb = new StringBuilder("Annotate VCF db:\n");
   sb.append("\n\tannotateEmpty        :").append(annotateEmpty);
   sb.append("\n\texistsInfoField      :").append(existsInfoField);
   sb.append("\n\tprependInfoFieldName :").append(prependInfoFieldName);
   sb.append("\n\tuseRefAlt            :").append(useRefAlt);
   sb.append("\n\tdbVcf:\n").append(Gpr.prependEachLine("\t\t", dbVcf));
   return sb.toString();
 }
Пример #13
0
  /**
   * Variant right before exon end: a G to A SNP at position 399 is expected to change the last
   * base of the second exon in the CDS
   */
  @Test
  public void test_apply_variant_06() {
    Gpr.debug("Test");

    Variant snpBeforeExonEnd = new Variant(transcript.getParent(), 399, "G", "A");

    // Exon[0]: unchanged
    String exon0 =
        "atgtccgcaggtgaaggcatacacgctgcgcgtatactgatgttacctcgatggattttgtcagaaatatggtgcccaggacgcgaagggcatattatgg";

    // Exon[1]: the modified base is written in upper case here, then the exon is lower-cased
    String exon1 =
        "tgtttgggaattcacgggcacggttctgcagcaagctgaattggcagctcggcataaatcccgaccccatcgtcacgcacggatcaattcatcctcaacA"
            .toLowerCase();

    // Exon[2]: unchanged
    String exon2 =
        "ggtagaggaaaagcacctaacccccattgagcaggatctctttcgtaatactctgtatcgattaccgatttatttgattccccacatttatttcatcggg";

    String expectedCds = exon0 + exon1 + exon2;

    checkApplySnp(snpBeforeExonEnd, expectedCds, null, 1, 300, 399);
  }
Пример #14
0
 /**
  * Query the index, starting at node 0, for all VCF entries intersecting 'queryMarker'.
  *
  * @return a new 'Markers' collection holding the entries found
  */
 @Override
 public Markers query(Interval queryMarker) {
   if (debug) {
     String region =
         queryMarker.getChromosomeName()
             + ":"
             + queryMarker.getStart()
             + "-"
             + queryMarker.getEnd();
     Gpr.debug("Query: " + region + "\t" + queryMarker);
   }

   Markers found = new Markers();
   query(queryMarker, 0, found);
   return found;
 }
Пример #15
0
  /**
   * Store protein sequence 'seq' under transcript ID 'trId'.
   *
   * <p>When 'check' is set and a different sequence is already stored for 'trId', an error is
   * printed to STDERR. The new sequence replaces the old one in either case.
   *
   * @param lineNum line number shown in the error message
   */
  void add(String trId, String seq, int lineNum, boolean check) {
    // Repeated transcript ID? => The protein should be the same
    if (check) {
      String previous = proteinByTrId.get(trId);
      boolean conflicting = (previous != null) && !previous.equals(seq);
      if (conflicting) {
        System.err.println(
            "ERROR: Different protein for the same transcript ID. This should never happen!!!"
                + "\n\tLine number   : " + lineNum
                + "\n\tTranscript ID : '" + trId + "'"
                + "\n\tProtein       : " + previous
                + "\n\tProtein (new) : " + seq);
      }
    }

    // The whole trId is used as key
    proteinByTrId.put(trId, seq);
    if (debug) {
      Gpr.debug("Adding proteinByTrId{'" + trId + "'} :\t" + seq);
    }
  }
Пример #16
0
  /**
   * Query the index, starting from node 'idx', for all VCF entries intersecting 'queryMarker'.
   * Entries found are stored in 'results'.
   *
   * <p>After checking the node itself, the search recurses into the left child when the query
   * ends before the node's mid point and into the right child when it starts after it.
   */
  protected void query(Interval queryMarker, int idx, Markers results) {
    // Negative index? Nothing to do
    if (idx < 0) {
      return;
    }

    if (debug) {
      String resultsSoFar = results.isEmpty() ? "" : "\n\tResults: " + results;
      Gpr.debug("Node: " + toString(idx) + resultsSoFar);
    }

    // Entries stored in this node
    queryIntersects(queryMarker, idx, results);

    int midPos = mid[idx];

    // Query lies entirely to the left of the mid point
    if (queryMarker.getEnd() < midPos && left[idx] >= 0) {
      query(queryMarker, left[idx], results);
    }

    // Query lies entirely to the right of the mid point
    if (midPos < queryMarker.getStart() && right[idx] >= 0) {
      query(queryMarker, right[idx], results);
    }
  }
Пример #17
0
  /**
   * Read proteins from the text file 'proteinFile'.
   *
   * <p>Each line is split on whitespace: the first field is used as transcript ID and the second
   * one as protein sequence. Lines with fewer than two fields are ignored (they still count for
   * line numbering).
   */
  void readProteinFileTxt() {
    // Load the whole file and split it into lines
    String[] lines = Gpr.readFile(proteinFile).split("\n");

    for (int i = 0; i < lines.length; i++) {
      String[] columns = lines[i].split("\\s+");
      if (columns.length < 2) {
        continue;
      }

      String trId = columns[0].trim();
      String seq = columns[1].trim();

      // Line numbers are one-based
      add(trId, seq, i + 1, true);
    }
  }
Пример #18
0
  /**
   * Read protein sequences from a features file.
   *
   * <p>Within each 'Features' record, a CDS feature's sequence is stored under the transcript ID
   * of the most recent MRNA feature; a GENE feature resets that ID. Without a preceding MRNA, the
   * CDS feature's own transcript ID is used.
   */
  void readProteinFileFeatures(FeaturesFile featuresFile) {
    for (Features features : featuresFile) {
      String mrnaTrId = null;

      for (Feature feature : features.getFeatures()) {
        if (feature.getType() == Type.GENE) {
          // New gene: forget the previous transcript ID
          mrnaTrId = null;
          continue;
        }

        if (feature.getType() == Type.MRNA) {
          // Remember the ID so that the next CDS record can find it
          mrnaTrId = feature.getTranscriptId();
          continue;
        }

        if (feature.getType() != Type.CDS) {
          continue;
        }

        // CDS 'translation' record: prefer the transcript ID found in the previous MRNA
        String trId = (mrnaTrId != null) ? mrnaTrId : feature.getTranscriptId();
        String seq = feature.getAasequence();

        if (debug) {
          Gpr.debug(trId + "\t" + seq);
        }
        if (trId != null && seq != null) {
          add(trId, seq, -1, true);
        }
      }
    }
  }
Пример #19
0
  /**
   * Describe this tree, one line per non-null child node, recursing into the children that pass
   * the thresholds.
   *
   * <p>A child is expanded when its count is at least 'thresholdCount' and either its entropy is
   * at most 'thresholdEntropy' or its probability is at least 'thresholdP'.
   *
   * @param tabs prefix added to every line (one more tab is added per recursion level)
   * @return the description, or an empty string if the total count is zero
   */
  public String toString(
      String tabs, double thresholdEntropy, double thresholdP, int thresholdCount) {
    if (getTotalCount() == 0) {
      return "";
    }

    StringBuilder out = new StringBuilder();
    double[] probs = p();

    for (int idx = 0; idx < 4; idx++) {
      char base = BASES[idx];
      AcgtTree child = nodes[idx];
      if (child == null) {
        continue;
      }

      // Summary line for this child
      String summary =
          String.format(
              "%s%s%s: %d\te:%4.3f\tp:%4.2f\n",
              tabs, name, base, counts[idx], child.entropy(), probs[idx]);
      out.append(summary);

      // Expand the child only if it passes the thresholds
      boolean informative = (child.entropy() <= thresholdEntropy) || (probs[idx] >= thresholdP);
      boolean enoughCounts = counts[idx] >= thresholdCount;
      if (informative && enoughCounts) {
        Gpr.debug(
            "Name:" + child.name
                + "\tIdx:" + idx
                + "\tEntropy: " + child.entropy()
                + "\tP:" + probs[idx]
                + "\tCount:" + counts[idx]);
        out.append(child.toString(tabs + "\t", thresholdEntropy, thresholdP, thresholdCount));
      }
    }

    return out.toString();
  }
  /**
   * Read and parse the genes file 'fileName'.
   *
   * <p>Lines starting with '#' and lines with fewer than 9 tab-separated fields are skipped. For
   * every other line a transcript is created (and its gene found or created), together with one
   * exon per listed exon and a CDS for each exon that overlaps the coding region.
   *
   * <p>End coordinates read from the file are decremented by one. Our internal database
   * representations of coordinates always have a zero-based start and a one-based end
   * (Reference: http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1 )
   *
   * @throws RuntimeException wrapping any exception raised while reading or parsing; the
   *     offending line is reported through Gpr.debug first
   */
  protected void readRefSeqFile() {
    // try-with-resources: the reader is closed even if parsing a line throws
    try (BufferedReader reader = Gpr.reader(fileName)) {
      if (reader == null) return; // Error

      int count = 0;
      for (lineNum = 1; reader.ready(); lineNum++) {
        line = reader.readLine();

        // Skip headers
        if (line.startsWith("#")) continue;

        String fields[] = line.split("\t");

        // NOTE(review): eleven fields (indexes 0 to 10) are read below, so a line having only 9
        // or 10 fields passes this check and then fails with an index error - confirm the
        // intended minimum
        if (fields.length < 9) continue;

        // Parse fields
        int fieldNum = 0;
        String id = fields[fieldNum++];
        String chromoName = fields[fieldNum++];
        boolean strandMinus = fields[fieldNum++].equals("-");

        // Transcript coordinates (end is converted, see method comment)
        int txstart = parsePosition(fields[fieldNum++]);
        int txend = parsePosition(fields[fieldNum++]) - 1;

        // Coding region coordinates (end is converted, see method comment)
        int cdsStart = parsePosition(fields[fieldNum++]);
        int cdsEnd = parsePosition(fields[fieldNum++]) - 1;

        // Exons: count and comma-separated coordinate lists
        int exonCount = Gpr.parseIntSafe(fields[fieldNum++]);
        String exonStarts = fields[fieldNum++];
        String exonEnds = fields[fieldNum++];

        String proteinId = fields[fieldNum++];
        // The field that follows (alignId) is not used

        // ---
        // Create
        // ---
        Chromosome chromo = getOrCreateChromosome(chromoName);

        // Is it protein coding? i.e. Protein ID assigned?
        boolean isCoding = !proteinId.isEmpty();

        // Create IDs
        String trId = uniqueTrId(id);

        // Get or create gene
        Gene gene =
            findOrCreateGene(proteinId, trId, chromo, txstart, txend, strandMinus, isCoding);

        // Create transcript
        Transcript tr = new Transcript(gene, txstart, txend, strandMinus, trId);
        tr.setProteinCoding(isCoding);
        add(tr);

        // Add Exons and CDS
        String exStartStr[] = exonStarts.split(",");
        String exEndStr[] = exonEnds.split(",");
        for (int i = 0; i < exonCount; i++) {
          // Exons (end is converted, see method comment)
          int exStart = parsePosition(exStartStr[i]);
          int exEnd = parsePosition(exEndStr[i]) - 1;
          String exId = trId + ".ex." + (i + 1);
          Exon ex = new Exon(tr, exStart, exEnd, strandMinus, exId, i);
          add(ex);

          // CDS (only if the exon intersects the coding region)
          if ((exStart <= cdsEnd) && (exEnd >= cdsStart)) {
            Cds cds =
                new Cds(
                    tr,
                    Math.max(cdsStart, exStart),
                    Math.min(cdsEnd, exEnd),
                    strandMinus,
                    exId);
            add(cds);
          }
        }

        // Show progress
        count++;
        if (count % MARK == 0) System.out.print('.');
        if (count % (100 * MARK) == 0) System.out.print("\n\t");
      }
    } catch (Exception e) {
      Gpr.debug("Offending line (lineNum: " + lineNum + "): '" + line + "'");
      throw new RuntimeException(e);
    }
  }
Пример #21
0
 /**
  * Get an INFO field as a 'double' number. The norm specifies the data type as 'FLOAT', that is
  * why the name of this method might not be intuitive.
  *
  * @return the parsed value, or NaN if the INFO field 'key' is not present
  */
 public double getInfoFloat(String key) {
   // INFO fields are parsed on first use
   if (info == null) {
     parseInfo();
   }

   String raw = info.get(key);
   if (raw != null) {
     return Gpr.parseDoubleSafe(raw);
   }
   return Double.NaN;
 }
Пример #22
0
  /**
   * Create the list of variants described by one REF / ALT pair.
   *
   * <p>The kind of variant is decided from 'alt' (converted to upper case):
   *
   * <ul>
   *   <li>null, empty or equal to 'reference': non-variant
   *   <li>starting with '&lt;': symbolic structural variants (&lt;DEL, &lt;INV, &lt;DUP); any
   *       other symbolic allele produces an empty list
   *   <li>containing '[' or ']': translocation (breakend)
   *   <li>same length as 'reference': SNP or MNP
   *   <li>otherwise: insertion, deletion or mixed variant, as decided by VcfRefAltAlign
   * </ul>
   *
   * <p>NOTE(review): besides the parameters, this method reads 'ref' and 'end', which are not
   * declared in the method (presumably fields of this entry) - confirm that using 'ref' instead
   * of 'reference' in the &lt;DEL and translocation branches is intended.
   *
   * @param chromo chromosome the variants belong to
   * @param start start position passed to the created variants (shifted in some branches)
   * @param reference REF sequence
   * @param alt one ALT value (may be null)
   * @param id identifier passed to the created variants
   * @return the variants created, never null (may be empty); each one has its genotype set to
   *     'alt'
   * @throws RuntimeException if the alignment result is inconsistent or of an unsupported type
   */
  List<Variant> variants(Chromosome chromo, int start, String reference, String alt, String id) {
    List<Variant> list = null;
    if (alt != null) alt = alt.toUpperCase();

    if (alt == null || alt.isEmpty() || alt.equals(reference)) {
      // Non-variant
      list = Variant.factory(chromo, start, reference, null, id, false);
    } else if (alt.charAt(0) == '<') {
      // Structural variants
      if (alt.startsWith("<DEL")) {
        // Case: Deletion
        // 2 321682    .  T   <DEL>         6     PASS
        // IMPRECISE;SVTYPE=DEL;END=321887;SVLEN=-105;CIPOS=-56,20;CIEND=-10,62
        String ch = ref;
        int startNew = start;

        if (end > start) {
          // Build the deleted sequence: it starts right after the reference bases and runs up to
          // 'end'. Bases not available in 'reference' are filled with 'N'
          startNew = start + reference.length();
          int size = end - startNew + 1;
          char change[] = new char[size];
          for (int i = 0; i < change.length; i++)
            change[i] = reference.length() > i ? reference.charAt(i) : 'N';
          ch = new String(change);
        }
        list = Variant.factory(chromo, startNew, ch, "", id, false);
      } else if (alt.startsWith("<INV")) {
        // Inversion
        int startNew = start + reference.length();
        Variant var = new Variant(chromo, startNew, end, id);
        var.setVariantType(VariantType.INV);
        list = new LinkedList<>();
        list.add(var);
      } else if (alt.startsWith("<DUP")) {
        // Duplication
        int startNew = start + reference.length();
        Variant var = new Variant(chromo, startNew, end, id);
        var.setVariantType(VariantType.DUP);
        list = new LinkedList<>();
        list.add(var);
      }
      // Any other symbolic allele: 'list' stays null here and becomes an empty list below
    } else if ((alt.indexOf('[') >= 0) || (alt.indexOf(']') >= 0)) {
      // Translocations

      // Parse ALT string: split on the bracket used, the position is the second piece and the
      // ALT bases are either the first or the third piece
      boolean left = alt.indexOf(']') >= 0;
      String sep = (left ? "\\]" : "\\[");
      String tpos[] = alt.split(sep);
      String pos = tpos[1];
      boolean before = (alt.indexOf(']') == 0) || (alt.indexOf('[') == 0);
      String altBases = (before ? tpos[2] : tpos[0]);

      // Parse 'chr:start' (the position is decremented by one)
      String posSplit[] = pos.split(":");
      String trChrName = posSplit[0];
      Chromosome trChr = chromo.getGenome().getOrCreateChromosome(trChrName);
      int trStart = Gpr.parseIntSafe(posSplit[1]) - 1;

      VariantBnd var = new VariantBnd(chromo, start, ref, altBases, trChr, trStart, left, before);
      list = new LinkedList<>();
      list.add(var);
    } else if (reference.length() == alt.length()) {
      // Case: SNP, MNP
      if (reference.length() == 1) {
        // SNPs
        // 20     3 .         C      G       .   PASS  DP=100
        list = Variant.factory(chromo, start, reference, alt, id, true);
      } else {
        // MNPs
        // 20     3 .         TC     AT      .   PASS  DP=100
        // Sometimes the first bases are the same and we can trim them
        // 'startDiff' ends up as the index of the first differing base
        int startDiff = Integer.MAX_VALUE;
        for (int i = 0; i < reference.length(); i++)
          if (reference.charAt(i) != alt.charAt(i)) startDiff = Math.min(startDiff, i);

        // MNPs
        // Sometimes the last bases are the same and we can trim them
        // 'endDiff' ends up as the index of the last differing base
        int endDiff = 0;
        for (int i = reference.length() - 1; i >= 0; i--)
          if (reference.charAt(i) != alt.charAt(i)) endDiff = Math.max(endDiff, i);

        String newRef = reference.substring(startDiff, endDiff + 1);
        String newAlt = alt.substring(startDiff, endDiff + 1);
        list = Variant.factory(chromo, start + startDiff, newRef, newAlt, id, true);
      }
    } else {
      // Short Insertions, Deletions or Mixed Variants (substitutions)
      VcfRefAltAlign align = new VcfRefAltAlign(alt, reference);
      align.align();
      int startDiff = align.getOffset();

      // Note: the locals 'ref' and 'ch' declared in the DEL case are in scope for the whole
      // switch block and are reused by the INS case; this local 'ref' hides the outer 'ref'
      switch (align.getVariantType()) {
        case DEL:
          // Case: Deletion
          // 20     2 .         TC      T      .   PASS  DP=100
          // 20     2 .         AGAC    AAC    .   PASS  DP=100
          String ref = "";
          String ch = align.getAlignment();
          if (!ch.startsWith("-"))
            throw new RuntimeException(
                "Deletion '" + ch + "' does not start with '-'. This should never happen!");
          list = Variant.factory(chromo, start + startDiff, ref, ch, id, true);
          break;

        case INS:
          // Case: Insertion of A { tC ; tCA } tC is the reference allele
          // 20     2 .         TC      TCA    .   PASS  DP=100
          ch = align.getAlignment();
          ref = "";
          if (!ch.startsWith("+"))
            throw new RuntimeException(
                "Insertion '" + ch + "' does not start with '+'. This should never happen!");
          list = Variant.factory(chromo, start + startDiff, ref, ch, id, true);
          break;

        case MIXED:
          // Case: Mixed variant (substitution)
          // The leading 'startDiff' bases are removed from both REF and ALT
          reference = reference.substring(startDiff);
          alt = alt.substring(startDiff);
          list = Variant.factory(chromo, start + startDiff, reference, alt, id, true);
          break;

        default:
          // Other change type?
          throw new RuntimeException(
              "Unsupported VCF change type '"
                  + align.getVariantType()
                  + "'\n\tRef: "
                  + reference
                  + "'\n\tAlt: '"
                  + alt
                  + "'\n\tVcfEntry: "
                  + this);
      }
    }

    // ---
    // Add 'ALT' field as genotype
    // NOTE(review): 'alt' is the upper-cased value and, in the MIXED case, it was trimmed above,
    // so it is not always the original ALT string - confirm this is intended
    // ---
    if (list == null) list = new LinkedList<>();
    for (Variant variant : list) variant.setGenotype(alt);

    return list;
  }
Пример #23
0
 /**
  * Get an INFO field as a 'long' number. The norm specifies the data type as 'INT', that is why
  * the name of this method might not be intuitive.
  *
  * @return the parsed value, or 0 if the INFO field 'key' is not present
  */
 public long getInfoInt(String key) {
   // INFO fields are parsed on first use
   if (info == null) {
     parseInfo();
   }

   String raw = info.get(key);
   if (raw != null) {
     return Gpr.parseLongSafe(raw);
   }
   return 0;
 }