Ejemplo n.º 1
0
  /**
   * Fill entries of a genotype array from a sparse, comma-separated list of indexes.
   *
   * <p>Every index listed in {@code str} selects one position of {@code gt}, which is overwritten
   * with {@code valueInt} narrowed to a {@code byte}. Positions that are not listed are left
   * untouched.
   *
   * @param str comma-separated indexes; nothing is done when it is {@code null}, empty or the
   *     literal {@code "true"} (presumably a flag-style value that carries no index list)
   * @param gt genotype array to update in place
   * @param valueInt value to store; it is cast to {@code byte}
   */
  void parseSparseGt(String str, byte gt[], int valueInt) {
    // Guard: there is no index list to parse
    boolean nothingToParse = (str == null) || str.isEmpty() || str.equals("true");
    if (nothingToParse) return;

    final byte value = (byte) valueInt;
    final String[] indexes = str.split(",");

    // Each index is parsed with Gpr.parseIntSafe and the matching entry is overwritten
    for (int k = 0; k < indexes.length; k++) {
      int position = Gpr.parseIntSafe(indexes[k]);
      gt[position] = value;
    }
  }
  /**
   * Read and parse the genes file.
   *
   * <p>Reads {@code fileName} line by line. Lines starting with {@code '#'} and lines with fewer
   * than 9 tab-separated fields are skipped. For every other line a {@code Transcript} is created
   * and added, followed by one {@code Exon} per exon and one {@code Cds} for every exon that
   * overlaps the coding interval. A progress dot is printed every {@code MARK} transcripts.
   *
   * <p>{@code lineNum} and {@code line} are updated while reading so that the failing line can be
   * reported when something goes wrong.
   *
   * @throws RuntimeException wrapping any exception raised while reading or parsing; the offending
   *     line is reported through {@code Gpr.debug} before the exception is thrown
   */
  protected void readRefSeqFile() {
    // try-with-resources closes the reader on every exit path, including parse failures.
    // A null resource is legal here and is simply not closed.
    try (BufferedReader reader = Gpr.reader(fileName)) {
      if (reader == null) return; // Error

      int count = 0;
      for (lineNum = 1; ; lineNum++) {
        // End of input is signalled by readLine() returning null. ready() only tells whether
        // a read could proceed without blocking, so it is not used as the loop condition.
        String nextLine = reader.readLine();
        if (nextLine == null) break;
        line = nextLine;

        // Skip headers
        if (line.startsWith("#")) continue;

        String fields[] = line.split("\t");

        // NOTE(review): this guard admits lines with 9 fields, but 11 fields are read below.
        // A line with 9 or 10 fields fails with ArrayIndexOutOfBoundsException, which ends up
        // in the catch block. Confirm the intended minimum.
        if (fields.length < 9) continue;

        // Parse fields
        int fieldNum = 0;
        String id = fields[fieldNum++];
        String chromoName = fields[fieldNum++];
        boolean strandMinus = fields[fieldNum++].equals("-");

        // Coordinates in the file have a zero-based start and a one-based end
        // (Reference: http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1 ),
        // so one is subtracted from every end position.
        int txstart = parsePosition(fields[fieldNum++]);
        int txend = parsePosition(fields[fieldNum++]) - 1;

        int cdsStart = parsePosition(fields[fieldNum++]);
        int cdsEnd = parsePosition(fields[fieldNum++]) - 1;

        int exonCount = Gpr.parseIntSafe(fields[fieldNum++]);
        String exonStarts = fields[fieldNum++]; // Comma-separated list
        String exonEnds = fields[fieldNum++]; // Comma-separated list

        String proteinId = fields[fieldNum++];
        // A further column (alignId) follows but is not used

        // ---
        // Create
        // ---
        Chromosome chromo = getOrCreateChromosome(chromoName);

        // Is it protein coding? (i.e. is a protein ID assigned?)
        boolean isCoding = !proteinId.isEmpty();

        // Create IDs
        String trId = uniqueTrId(id);

        // Get or create gene
        Gene gene =
            findOrCreateGene(proteinId, trId, chromo, txstart, txend, strandMinus, isCoding);

        // Create transcript
        Transcript tr = new Transcript(gene, txstart, txend, strandMinus, trId);
        tr.setProteinCoding(isCoding);
        add(tr);

        // Add Exons and CDS
        String exStartStr[] = exonStarts.split(",");
        String exEndStr[] = exonEnds.split(",");
        for (int i = 0; i < exonCount; i++) {
          // Exons (the end gets the same one-based to zero-based adjustment as above)
          int exStart = parsePosition(exStartStr[i]);
          int exEnd = parsePosition(exEndStr[i]) - 1;
          String exId = trId + ".ex." + (i + 1);
          Exon ex = new Exon(tr, exStart, exEnd, strandMinus, exId, i);
          add(ex);

          // CDS (only if the exon intersects the coding interval); clipped to the overlap
          if ((exStart <= cdsEnd) && (exEnd >= cdsStart)) {
            Cds cds =
                new Cds(
                    tr,
                    Math.max(cdsStart, exStart),
                    Math.min(cdsEnd, exEnd),
                    strandMinus,
                    exId);
            add(cds);
          }
        }

        // Progress indicator
        count++;
        if (count % MARK == 0) System.out.print('.');
        if (count % (100 * MARK) == 0) System.out.print("\n\t");
      }
    } catch (Exception e) {
      Gpr.debug("Offending line (lineNum: " + lineNum + "): '" + line + "'");
      throw new RuntimeException(e);
    }
  }
Ejemplo n.º 3
0
  /**
   * Create the variant(s) described by one REF / ALT pair.
   *
   * <p>The ALT string is upper-cased and then classified:
   *
   * <ul>
   *   <li>{@code null}, empty or equal to {@code reference}: non-variant
   *   <li>starts with {@code '<'}: symbolic structural variant; only {@code <DEL}, {@code <INV}
   *       and {@code <DUP} prefixes are handled, any other symbolic ALT yields an empty list
   *   <li>contains {@code '['} or {@code ']'}: translocation (a {@code VariantBnd})
   *   <li>same length as {@code reference}: SNP or MNP (MNPs are trimmed to the differing span)
   *   <li>otherwise: REF and ALT are aligned with {@code VcfRefAltAlign} and turned into a
   *       deletion, insertion or mixed variant
   * </ul>
   *
   * <p>Besides its parameters the method reads {@code ref} and {@code end}, which are not declared
   * here (presumably fields of the enclosing VCF entry; verify against the class).
   *
   * @param chromo chromosome the variant is located on
   * @param start start position of the variant
   * @param reference reference allele
   * @param alt alternative allele; may be {@code null}
   * @param id variant identifier passed on to the created variants
   * @return the created variants; never {@code null}, possibly empty. Each has its genotype set
   *     to the value of {@code alt} at the end of the method
   * @throws RuntimeException if the alignment yields a deletion not starting with {@code '-'}, an
   *     insertion not starting with {@code '+'}, or an unsupported variant type
   */
  List<Variant> variants(Chromosome chromo, int start, String reference, String alt, String id) {
    List<Variant> list = null;
    // Locale.ROOT keeps the upper-casing independent of the default locale (e.g. under a
    // Turkish locale 'i' would otherwise not map to ASCII 'I', so "<ins>"/"<inv>" would not
    // match the prefixes tested below)
    if (alt != null) alt = alt.toUpperCase(java.util.Locale.ROOT);

    if (alt == null || alt.isEmpty() || alt.equals(reference)) {
      // Non-variant
      list = Variant.factory(chromo, start, reference, null, id, false);
    } else if (alt.charAt(0) == '<') {
      // Structural variants
      if (alt.startsWith("<DEL")) {
        // Case: Deletion
        // 2 321682    .  T   <DEL>         6     PASS
        // IMPRECISE;SVTYPE=DEL;END=321887;SVLEN=-105;CIPOS=-56,20;CIEND=-10,62
        String ch = ref; // 'ref' here is the outer name, not the 'reference' parameter
        int startNew = start;

        if (end > start) {
          // Build the deleted sequence: bases known from 'reference' first, 'N' for the rest
          // NOTE(review): 'size' is negative when end < startNew - 1, which would make the
          // array allocation throw; confirm callers guarantee end >= start + reference.length() - 1
          startNew = start + reference.length();
          int size = end - startNew + 1;
          char change[] = new char[size];
          for (int i = 0; i < change.length; i++)
            change[i] = reference.length() > i ? reference.charAt(i) : 'N';
          ch = new String(change);
        }
        list = Variant.factory(chromo, startNew, ch, "", id, false);
      } else if (alt.startsWith("<INV")) {
        // Inversion
        int startNew = start + reference.length();
        Variant var = new Variant(chromo, startNew, end, id);
        var.setVariantType(VariantType.INV);
        list = new LinkedList<>();
        list.add(var);
      } else if (alt.startsWith("<DUP")) {
        // Duplication
        int startNew = start + reference.length();
        Variant var = new Variant(chromo, startNew, end, id);
        var.setVariantType(VariantType.DUP);
        list = new LinkedList<>();
        list.add(var);
      }
      // Any other symbolic ALT leaves 'list' null; it becomes an empty list below
    } else if ((alt.indexOf('[') >= 0) || (alt.indexOf(']') >= 0)) {
      // Translocations

      // Parse ALT string: the mate position sits between two identical brackets and the
      // bases are either before the first bracket or after the second one
      boolean left = alt.indexOf(']') >= 0;
      String sep = (left ? "\\]" : "\\[");
      String tpos[] = alt.split(sep);
      String pos = tpos[1];
      boolean before = (alt.indexOf(']') == 0) || (alt.indexOf('[') == 0);
      String altBases = (before ? tpos[2] : tpos[0]);

      // Parse 'chr:start'
      String posSplit[] = pos.split(":");
      String trChrName = posSplit[0];
      Chromosome trChr = chromo.getGenome().getOrCreateChromosome(trChrName);
      // One is subtracted from the parsed position (presumably one-based to zero-based)
      int trStart = Gpr.parseIntSafe(posSplit[1]) - 1;

      VariantBnd var = new VariantBnd(chromo, start, ref, altBases, trChr, trStart, left, before);
      list = new LinkedList<>();
      list.add(var);
    } else if (reference.length() == alt.length()) {
      // Case: SNP, MNP
      if (reference.length() == 1) {
        // SNPs
        // 20     3 .         C      G       .   PASS  DP=100
        list = Variant.factory(chromo, start, reference, alt, id, true);
      } else {
        // MNPs
        // 20     3 .         TC     AT      .   PASS  DP=100
        // Sometimes the first bases are the same and we can trim them.
        // At least one position differs here: equal strings were handled as non-variant above.
        int startDiff = Integer.MAX_VALUE;
        for (int i = 0; i < reference.length(); i++)
          if (reference.charAt(i) != alt.charAt(i)) startDiff = Math.min(startDiff, i);

        // Sometimes the last bases are the same and we can trim them
        int endDiff = 0;
        for (int i = reference.length() - 1; i >= 0; i--)
          if (reference.charAt(i) != alt.charAt(i)) endDiff = Math.max(endDiff, i);

        // Keep only the span between the first and the last differing base
        String newRef = reference.substring(startDiff, endDiff + 1);
        String newAlt = alt.substring(startDiff, endDiff + 1);
        list = Variant.factory(chromo, start + startDiff, newRef, newAlt, id, true);
      }
    } else {
      // Short Insertions, Deletions or Mixed Variants (substitutions)
      VcfRefAltAlign align = new VcfRefAltAlign(alt, reference);
      align.align();
      int startDiff = align.getOffset();

      switch (align.getVariantType()) {
        case DEL:
          // Case: Deletion
          // 20     2 .         TC      T      .   PASS  DP=100
          // 20     2 .         AGAC    AAC    .   PASS  DP=100
          // Note: this local 'ref' shadows the outer 'ref' for the rest of the switch block;
          // both 'ref' and 'ch' are reused by the INS case below
          String ref = "";
          String ch = align.getAlignment();
          if (!ch.startsWith("-"))
            throw new RuntimeException(
                "Deletion '" + ch + "' does not start with '-'. This should never happen!");
          list = Variant.factory(chromo, start + startDiff, ref, ch, id, true);
          break;

        case INS:
          // Case: Insertion of A { tC ; tCA } tC is the reference allele
          // 20     2 .         TC      TCA    .   PASS  DP=100
          ch = align.getAlignment();
          ref = "";
          if (!ch.startsWith("+"))
            throw new RuntimeException(
                "Insertion '" + ch + "' does not start with '+'. This should never happen!");
          list = Variant.factory(chromo, start + startDiff, ref, ch, id, true);
          break;

        case MIXED:
          // Case: Mixed variant (substitution): drop the leading 'startDiff' bases of both
          // NOTE(review): 'alt' is reassigned here, so the genotype set at the end of the
          // method is the trimmed ALT for this case; confirm that is intended
          reference = reference.substring(startDiff);
          alt = alt.substring(startDiff);
          list = Variant.factory(chromo, start + startDiff, reference, alt, id, true);
          break;

        default:
          // Other change type?
          throw new RuntimeException(
              "Unsupported VCF change type '"
                  + align.getVariantType()
                  + "'\n\tRef: "
                  + reference
                  + "'\n\tAlt: '"
                  + alt
                  + "'\n\tVcfEntry: "
                  + this);
      }
    }

    // ---
    // Add 'ALT' field as genotype (upper-cased above; trimmed in the MIXED case)
    // ---
    if (list == null) list = new LinkedList<>();
    for (Variant variant : list) variant.setGenotype(alt);

    return list;
  }