Пример #1
0
  /**
   * Get the VCF entries belonging to node 'idx'.
   *
   * <p>Entries are served from the leaf cache or from 'intersect[idx]' when available. Otherwise
   * every file region recorded for the node is read from 'vcf' and the result is cached.
   *
   * @param idx node index
   * @return entries for the node; each distinct entry appears only once
   * @throws RuntimeException wrapping any IOException raised while reading
   */
  List<VcfEntry> readEntries(int idx) {
    // Already cached (either as the last leaf read, or as a stored node)?
    if (idx == cachedLeafNodeIdx) return cachedLeafNode;
    List<VcfEntry> entries = intersect[idx];
    if (entries != null) return entries;

    try {
      // A node may map to several non-contiguous file regions
      int numRegions = intersectFilePosStart[idx].length;

      entries = new ArrayList<VcfEntry>();
      Set<VcfEntry> seen = new HashSet<>();
      for (int region = 0; region < numRegions; region++) {
        if (debug) {
          Gpr.debug(
              "\tintersect["
                  + idx
                  + "]["
                  + region
                  + "]:\t["
                  + intersectFilePosStart[idx][region]
                  + " , "
                  + intersectFilePosEnd[idx][region]
                  + " ]");
        }

        long regionStart = intersectFilePosStart[idx][region];
        long regionEnd = intersectFilePosEnd[idx][region];

        // Jump to the beginning of the region and read until we reach its end
        vcf.seek(regionStart);
        for (VcfEntry entry : vcf) {
          // Regions may yield the same entry more than once; keep only the first occurrence
          boolean firstTime = seen.add(entry);
          if (firstTime) {
            entries.add(entry);
            if (debug) Gpr.debug("\tParsing VcfEntry [" + vcf.getFilePointer() + "]: " + entry);
          }

          // Reached the end of this region?
          if (vcf.getFilePointer() >= regionEnd) break;
        }
      }

      // Remember the result: leaves share a single cache slot, other nodes are stored per node
      if (isLeaf(idx)) {
        cachedLeafNodeIdx = idx;
        cachedLeafNode = entries;
      } else if (intersect[idx] == null) {
        intersect[idx] = entries;
      }

      return entries;
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }
Пример #2
0
  /**
   * Tell whether this variant is a duplication.
   *
   * <p>Reference: http://www.hgvs.org/mutnomen/disc.html#dupins ...the description "dup" (see
   * Standards) may by definition only be used when the additional copy is directly 3'-flanking of
   * the original copy (tandem duplication)
   *
   * @return true if AA.Alt equals AA.Ref repeated twice, or if the net amino acid change equals
   *     the protein sequence ending at the variant's codon number; false otherwise (including
   *     when the protein sequence is not available)
   */
  protected boolean isDuplication() {
    // ---
    // Simple case: AA.Alt is AA.Ref written twice
    // ---
    String refAa = variantEffect.getAaRef().toUpperCase();
    String altAa = variantEffect.getAaAlt().toUpperCase();
    if (debug) Gpr.debug("AA.Ref: '" + refAa + "'\tAA.Alt: '" + altAa);
    if (altAa.equals(refAa + refAa)) return true;

    // ---
    // General case: compare against the protein sequence
    // ---
    String proteinSeq = tr.protein();
    if (proteinSeq == null) return false; // Cannot calculate duplication

    // Net amino acid change introduced by the variant
    String netChange = variantEffect.getAaNetChange();

    // Protein stretch of the same length, ending at the variant's codon number
    int end = variantEffect.getCodonNum();
    int start = end - netChange.length();
    if (start < 0 || end > proteinSeq.length()) return false;
    String preceding = proteinSeq.substring(start, end);

    // It is a duplication if the net change repeats that stretch
    boolean isDup = preceding.equalsIgnoreCase(netChange);
    if (debug) {
      Gpr.debug(
          "SEQUENCE [ "
              + start
              + " , "
              + end
              + " ]: '"
              + preceding
              + "'\n\tAA Ref       : '"
              + variantEffect.getAaRef()
              + "'\n\tAA Alt       : '"
              + variantEffect.getAaAlt()
              + "'\n\tAA Alt (net) : '"
              + netChange
              + "'\n\tDup?         : "
              + isDup);
    }

    return isDup;
  }
Пример #3
0
  /**
   * Annotate a VCF entry using matching database entries.
   *
   * @param vcfEntry entry to annotate
   * @return true if annotateIds() or annotateInfo() reported a change
   * @throws IOException declared by the method signature
   */
  public boolean annotate(VcfEntry vcfEntry) throws IOException {
    // ---
    // Query the database for each variant of the entry
    // Note that QueryResult.variantVcfEntry can be 'null'
    // ---
    List<QueryResult> queryResults = new LinkedList<>();
    Set<VcfEntry> dbVcfEntries = new HashSet<>();
    for (Variant variant : vcfEntry.variants()) {
      // Huge structural variants are not queried
      if (!variant.isStructuralHuge()) {
        Collection<VariantVcfEntry> dbMatches = query(variant);

        // Keep track of every database VcfEntry found
        for (VariantVcfEntry dbMatch : dbMatches) {
          dbVcfEntries.add(dbMatch.getVcfEntry());
        }

        // Store query together with its results
        QueryResult queryResult = new QueryResult(variant, dbMatches);
        queryResults.add(queryResult);
        if (debug) Gpr.debug("Adding QueryResult: " + queryResult);
      }
    }

    // Look for INFO fields we might have not seen before
    if (useAllInfoFields) {
      for (VcfEntry dbVcfEntry : dbVcfEntries) {
        discoverInfoFields(dbVcfEntry);
      }
    }

    // INFO fields using 'REF' data
    Map<String, String> infos = new HashMap<>();
    findDbInfoRef(infos, dbVcfEntries);

    // ---
    // Collect IDs, 'exists' flag and INFO values from every query result
    // ---
    Set<String> idSet = new HashSet<>();
    boolean exists = false;
    for (QueryResult queryResult : queryResults) {
      if (debug) Gpr.debug("Processing QueryResult: " + queryResult);

      if (useId) findDbId(idSet, queryResult);
      if (existsInfoField != null && findDbExists(queryResult)) exists = true;
      if (useInfoFields) findDbInfo(infos, queryResult);
    }

    // Write collected annotations into the input vcfEntry
    boolean idsAnnotated = annotateIds(vcfEntry, idSet);
    boolean infoAnnotated = annotateInfo(vcfEntry, infos);
    if (exists) annotateExists(vcfEntry);

    return idsAnnotated || infoAnnotated;
  }
Пример #4
0
 /**
  * Append a value to the comma-separated list kept in 'info' for field 'infoFieldName'.
  *
  * <p>A null value is ignored unless 'annotateEmpty' is set, in which case it is stored as ".".
  */
 void findDbInfoAddValue(Map<String, String> info, String infoFieldName, String newValue) {
   boolean missing = (newValue == null);
   if (missing && !annotateEmpty) return;
   if (debug) Gpr.debug("\tINFO:" + infoFieldName + "\tnewValue: " + newValue);

   // Missing values are represented by '.'
   String addition = missing ? "." : newValue;

   String previous = info.get(infoFieldName);
   if (previous == null) {
     info.put(infoFieldName, addition);
   } else {
     info.put(infoFieldName, previous + "," + addition);
   }
 }
Пример #5
0
 /** Rare amino acid: compare effects for 'tests/rareAa.txt' using genome 'testHg3765Chr22' */
 @Test
 public void test_30_RareAa() {
   Gpr.debug("Test");
   new CompareEffects("testHg3765Chr22", randSeed, verbose)
       .snpEffect("tests/rareAa.txt", null, true);
 }
Пример #6
0
  /**
   * Variant after exon end: the transcript's own CDS and protein are passed as the expected
   * values
   */
  @Test
  public void test_apply_variant_09() {
    Gpr.debug("Test");

    checkApplySnp(
        new Variant(transcript.getParent(), 410, "A", "T"),
        transcript.cds(),
        transcript.protein(),
        1,
        300,
        399);
  }
Пример #7
0
  /**
   * Collect the non-empty values of INFO field 'infoFieldName' from all results of a query.
   *
   * @return comma-separated values in result order, or null if no value was found
   */
  protected String findDbInfo(String infoFieldName, QueryResult qr) {
    if (debug) Gpr.debug("Finding DB data for INFO field: " + infoFieldName);

    StringBuilder joined = new StringBuilder();
    for (VariantVcfEntry dbEntry : qr.results) {
      // Results may contain null entries
      if (dbEntry == null) continue;

      String value = dbEntry.getVcfEntry().getInfo(infoFieldName);
      if (VcfEntry.isEmpty(value)) continue;

      if (debug) Gpr.debug("\tFound: " + value);
      if (joined.length() > 0) joined.append(',');
      joined.append(value);
    }

    if (joined.length() > 0) return joined.toString();
    return null;
  }
Пример #8
0
  /**
   * Query the database and keep only the entries that match 'variant'.
   *
   * @return matching database entries, in the order returned by the database query
   */
  protected Collection<VariantVcfEntry> query(Variant variant) {
    List<VariantVcfEntry> matches = new LinkedList<>();

    // Query database, then filter each candidate using match()
    for (VariantVcfEntry candidate : dbVcf.query(variant)) {
      boolean isMatch = match(variant, candidate);
      if (debug) {
        String outcome = isMatch ? "dbEntry matches query" : "dbEntry does NOT match query";
        Gpr.debug(outcome + "\tvariant: " + variant + "\tdbEntry: " + candidate);
      }
      if (isMatch) matches.add(candidate);
    }

    if (debug) Gpr.debug("Match query results: " + matches.size());
    return matches;
  }
Пример #9
0
  /**
   * Add to 'results' the VCF entries stored at node 'idx' that intersect 'queryMarker'.
   *
   * <p>Does nothing when the node has no file positions recorded.
   */
  protected void queryIntersects(Interval queryMarker, int idx, Markers results) {
    if (intersectFilePosStart[idx] == null) return;
    if (debug) Gpr.debug("queryIntersects\tidx: " + idx);

    // Entries come from disk (or from cache)
    for (VcfEntry entry : readEntries(idx)) {
      // An entry is a result if at least one of its variants intersects the query
      for (Variant variant : entry.variants()) {
        if (!variant.intersects(queryMarker)) continue;

        if (debug) Gpr.debug("\tAdding matching result: " + entry);
        results.add(entry);
        break; // One match is enough: store the entry only once
      }

      // Entry starts after the query's end coordinate? Stop looking
      if (entry.getStart() > queryMarker.getEnd()) return;
    }
  }
Пример #10
0
  /** Variant right before exon end */
  @Test
  public void test_apply_variant_06() {
    Gpr.debug("Test");

    Variant snp = new Variant(transcript.getParent(), 399, "G", "A");

    // Expected CDS, one literal per exon
    String exon0 =
        "atgtccgcaggtgaaggcatacacgctgcgcgtatactgatgttacctcgatggattttgtcagaaatatggtgcccaggacgcgaagggcatattatgg";
    // The literal ends in an upper case 'A' (presumably marking the changed base);
    // toLowerCase() turns the whole exon into lower case
    String exon1 =
        "tgtttgggaattcacgggcacggttctgcagcaagctgaattggcagctcggcataaatcccgaccccatcgtcacgcacggatcaattcatcctcaacA"
            .toLowerCase();
    String exon2 =
        "ggtagaggaaaagcacctaacccccattgagcaggatctctttcgtaatactctgtatcgattaccgatttatttgattccccacatttatttcatcggg";
    String expectedCds = exon0 + exon1 + exon2;

    checkApplySnp(snp, expectedCds, null, 1, 300, 399);
  }
Пример #11
0
 /**
  * Query the index for all VCF entries intersecting 'queryMarker'.
  *
  * @return a new Markers collection filled by searching from node 0
  */
 @Override
 public Markers query(Interval queryMarker) {
   if (debug) {
     String region =
         queryMarker.getChromosomeName()
             + ":"
             + queryMarker.getStart()
             + "-"
             + queryMarker.getEnd();
     Gpr.debug("Query: " + region + "\t" + queryMarker);
   }

   // Search starts at node 0
   Markers results = new Markers();
   query(queryMarker, 0, results);
   return results;
 }
Пример #12
0
  /**
   * Query the index, starting from node 'idx', for all VCF entries intersecting 'queryMarker'.
   * Matching entries are stored in 'results'.
   */
  protected void query(Interval queryMarker, int idx, Markers results) {
    // A negative index means there is no node to visit
    if (idx < 0) return;

    if (debug) {
      String node = toString(idx);
      String found = results.isEmpty() ? "" : "\n\tResults: " + results;
      Gpr.debug("Node: " + node + found);
    }

    // Entries stored at this node
    queryIntersects(queryMarker, idx, results);

    // Descend into the child on each side of the node's mid point, when the query lies there
    int center = mid[idx];
    if (queryMarker.getEnd() < center) {
      if (left[idx] >= 0) query(queryMarker, left[idx], results);
    }

    if (center < queryMarker.getStart()) {
      if (right[idx] >= 0) query(queryMarker, right[idx], results);
    }
  }
Пример #13
0
  /**
   * Store the protein sequence 'seq' for transcript 'trId' in 'proteinByTrId', replacing any
   * previous value.
   *
   * <p>When 'check' is set and a different sequence is already stored for the same transcript ID,
   * an error is printed to STDERR (the new sequence is stored anyway).
   */
  void add(String trId, String seq, int lineNum, boolean check) {
    // Repeated transcript ID with a different protein?
    boolean conflict =
        check //
            && (proteinByTrId.get(trId) != null) //
            && !proteinByTrId.get(trId).equals(seq);

    if (conflict) {
      String message =
          "ERROR: Different protein for the same transcript ID. This should never happen!!!"
              + "\n\tLine number   : "
              + lineNum
              + "\n\tTranscript ID : '"
              + trId
              + "'"
              + "\n\tProtein       : "
              + proteinByTrId.get(trId)
              + "\n\tProtein (new) : "
              + seq;
      System.err.println(message);
    }

    // The whole trId is used as key
    proteinByTrId.put(trId, seq);
    if (debug) Gpr.debug("Adding proteinByTrId{'" + trId + "'} :\t" + seq);
  }
Пример #14
0
  /**
   * Read protein sequences from a features file.
   *
   * <p>For each CDS feature, the amino acid sequence is added under the transcript ID of the
   * latest MRNA feature seen since the last GENE feature, or under the CDS' own transcript ID if
   * there is none.
   */
  void readProteinFileFeatures(FeaturesFile featuresFile) {
    for (Features features : featuresFile) {
      String lastMrnaTrId = null;

      for (Feature feature : features.getFeatures()) {
        if (feature.getType() == Type.GENE) {
          // New gene: forget the transcript ID
          lastMrnaTrId = null;
          continue;
        }

        if (feature.getType() == Type.MRNA) {
          // Remember the transcript ID for the CDS records that follow
          lastMrnaTrId = feature.getTranscriptId();
          continue;
        }

        // Only CDS records carry a 'translation'
        if (feature.getType() != Type.CDS) continue;

        // Prefer the transcript ID found in a previous MRNA record
        String trId = (lastMrnaTrId != null) ? lastMrnaTrId : feature.getTranscriptId();
        String seq = feature.getAasequence();

        if (debug) Gpr.debug(trId + "\t" + seq);
        if (trId == null || seq == null) continue;
        add(trId, seq, -1, true);
      }
    }
  }
Пример #15
0
  /**
   * Build a multi-line description of this tree: one line per non-null child node, recursing
   * into a child when its count reaches 'thresholdCount' and either its entropy is at most
   * 'thresholdEntropy' or its probability is at least 'thresholdP'.
   *
   * @param tabs prefix prepended to each line (one more tab is added per recursion level)
   * @return the description, or an empty string if the total count is zero
   */
  public String toString(
      String tabs, double thresholdEntropy, double thresholdP, int thresholdCount) {
    if (getTotalCount() == 0) return "";

    StringBuilder out = new StringBuilder();
    double[] probs = p();
    for (int i = 0; i < 4; i++) {
      AcgtTree child = nodes[i];
      if (child == null) continue;

      // Summary line for this child
      char base = BASES[i];
      String summary =
          String.format(
              "%s%s%s: %d\te:%4.3f\tp:%4.2f\n",
              tabs, name, base, counts[i], child.entropy(), probs[i]);
      out.append(summary);

      // Should we show the child's sub-tree?
      boolean selected = (child.entropy() <= thresholdEntropy) || (probs[i] >= thresholdP);
      boolean enoughCounts = counts[i] >= thresholdCount;
      if (selected && enoughCounts) {
        // Note: this message is shown regardless of any debug flag
        Gpr.debug(
            "Name:"
                + child.name
                + "\tIdx:"
                + i
                + "\tEntropy: "
                + child.entropy()
                + "\tP:"
                + probs[i]
                + "\tCount:"
                + counts[i]);
        out.append(child.toString(tabs + "\t", thresholdEntropy, thresholdP, thresholdCount));
      }
    }

    return out.toString();
  }
  /**
   * Read and parse genes file.
   *
   * <p>Each non-header line is split on tabs and read as: id, chromosome, strand, txStart, txEnd,
   * cdsStart, cdsEnd, exonCount, exonStarts, exonEnds and (optionally) proteinId. Lines starting
   * with '#' and lines that do not have all columns up to 'exonEnds' are skipped. For every
   * parsed line a gene is found or created and a transcript, its exons and the CDS parts
   * overlapping each exon are added.
   *
   * <p>The reader is always closed, even if parsing fails.
   *
   * @throws RuntimeException wrapping any exception raised while reading or parsing; the
   *     offending line is shown using Gpr.debug() before re-throwing
   */
  protected void readRefSeqFile() {
    // try-with-resources closes the reader on every exit path (a null resource is simply skipped)
    try (BufferedReader reader = Gpr.reader(fileName)) {
      if (reader == null) return; // Error

      int count = 0;
      String nextLine;

      // End of file is signalled by readLine() returning null. ready() only tells whether a
      // read might block, so it must not be used as an end-of-file test.
      for (lineNum = 1; (nextLine = reader.readLine()) != null; lineNum++) {
        line = nextLine;

        // Skip headers
        if (line.startsWith("#")) continue;

        String[] fields = line.split("\t");

        // Columns 0..9 (id to exonEnds) are required. Note that split() drops trailing empty
        // columns, so an empty 'proteinId' at the end of the line may be missing from 'fields'.
        if (fields.length < 10) continue;

        // Parse fields
        int fieldNum = 0;
        String id = fields[fieldNum++];
        String chromoName = fields[fieldNum++];
        boolean strandMinus = fields[fieldNum++].equals("-");

        // Coordinates in the file have a zero-based start and a one-based end
        // (Reference: http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1 ), hence the '- 1'
        // applied to every end position
        int txstart = parsePosition(fields[fieldNum++]);
        int txend = parsePosition(fields[fieldNum++]) - 1;

        int cdsStart = parsePosition(fields[fieldNum++]);
        int cdsEnd = parsePosition(fields[fieldNum++]) - 1;

        int exonCount = Gpr.parseIntSafe(fields[fieldNum++]);
        String exonStarts = fields[fieldNum++];
        String exonEnds = fields[fieldNum++];

        // Protein ID is optional: a missing column is treated as an empty ID
        String proteinId = (fieldNum < fields.length) ? fields[fieldNum] : "";
        // The next column (alignId) is not used

        // ---
        // Create
        // ----
        Chromosome chromo = getOrCreateChromosome(chromoName);

        // Is it protein coding? (i.e. is there a protein ID assigned?)
        boolean isCoding = !proteinId.isEmpty();

        // Create IDs
        String trId = uniqueTrId(id);

        // Get or create gene
        Gene gene =
            findOrCreateGene(proteinId, trId, chromo, txstart, txend, strandMinus, isCoding);

        // Create transcript
        Transcript tr = new Transcript(gene, txstart, txend, strandMinus, trId);
        tr.setProteinCoding(isCoding);
        add(tr);

        // Add Exons and CDS
        String[] exStartStr = exonStarts.split(",");
        String[] exEndStr = exonEnds.split(",");
        for (int i = 0; i < exonCount; i++) {
          // Exons (the end coordinate is one-based in the file, see above)
          int exStart = parsePosition(exStartStr[i]);
          int exEnd = parsePosition(exEndStr[i]) - 1;
          String exId = trId + ".ex." + (i + 1);
          Exon ex = new Exon(tr, exStart, exEnd, strandMinus, exId, i);
          add(ex);

          // CDS (only if it intersects the exon)
          if ((exStart <= cdsEnd) && (exEnd >= cdsStart)) {
            Cds cds =
                new Cds(
                    tr,
                    Math.max(cdsStart, exStart),
                    Math.min(cdsEnd, exEnd),
                    strandMinus,
                    exId);
            add(cds);
          }
        }

        // Show progress
        count++;
        if (count % MARK == 0) System.out.print('.');
        if (count % (100 * MARK) == 0) System.out.print("\n\t");
      }
    } catch (Exception e) {
      Gpr.debug("Offending line (lineNum: " + lineNum + "): '" + line + "'");
      throw new RuntimeException(e);
    }
  }