コード例 #1
0
  /**
   * Credits every feature in {@code features} with its share of the mapped bases and reports
   * how much of {@code totalMappedBases} was left unassigned.
   *
   * <p>For each entry, {@code count / totalMappedBases} is added to the {@link FeatureCount}
   * registered in {@code featureCounts} under the feature's {@code "id"} attribute. The
   * denominator is always {@code totalMappedBases}, not the sum of the per-feature counts.
   *
   * @param totalMappedBases denominator used for every fraction
   * @param features per-feature counts to distribute
   * @param featureCounts accumulators keyed by feature id; updated in place
   * @return {@code (totalMappedBases - sum of counts) / totalMappedBases}, or {@code 1.0} when
   *     {@code features} is empty
   * @throws IllegalArgumentException if a feature has no {@code "id"} attribute
   * @throws IllegalStateException if no accumulator exists for a feature id, or if the
   *     returned fraction would be below {@code -0.0001}
   */
  static double addToFeatures(
      int totalMappedBases,
      Map<Feature, Integer> features,
      Map<String, FeatureCount> featureCounts) {
    assert (features != null);
    assert (featureCounts != null);

    // Nothing overlapped any feature: the whole amount is unassigned.
    if (features.isEmpty()) {
      return 1.0;
    }

    final double denominator = (double) totalMappedBases;
    double assigned = 0.0;

    for (Map.Entry<Feature, Integer> hit : features.entrySet()) {
      Feature target = hit.getKey();
      double bases = hit.getValue();
      assigned += bases;

      String featureId = target.getAttribute("id");
      if (featureId == null) {
        throw new IllegalArgumentException(
            "Feature must have id attribute\n" + GTFFeatureRenderer.render(target));
      }

      FeatureCount accumulator = featureCounts.get(featureId);
      if (accumulator == null) {
        throw new IllegalStateException("No FeatureCount object found for id: " + featureId);
      }

      accumulator.addToCount(bases / denominator);
    }

    double unassigned = (denominator - assigned) / denominator;
    // A slightly negative value is let through (presumably rounding slack -- TODO confirm);
    // anything below the threshold means more was assigned than was available.
    if (unassigned < -0.0001) {
      String message =
          "There was an issue counting reads...\nresult="
              + unassigned
              + "\ntotalMappedBases="
              + totalMappedBases
              + "\nmapped="
              + assigned;
      throw new IllegalStateException(message);
    }
    return unassigned;
  }
コード例 #2
0
  /**
   * Reads a GTF annotation and populates this object's gene, feature-count and per-chromosome
   * feature tables.
   *
   * <p>Steps, in order:
   *
   * <ol>
   *   <li>Read gene and exon records (reader configured with {@code GeneOrExonFilter} and
   *       {@code RemoveRetainedIntronFilter}); genes go into {@code geneInfo}, exons are
   *       grouped by their {@code gene_id} attribute.
   *   <li>Check that every gene id seen on an exon also has a gene record.
   *   <li>Sort each gene's exons, collect the exons that overlap between overlapping gene
   *       pairs, store that set in {@code featuresRemovedForAnalysis} and drop those exons.
   *   <li>Merge the remaining exons per gene, register a {@code FeatureCount} for each merged
   *       feature, and bucket the merged features by the seqname of the gene's first merged
   *       feature.
   *   <li>Sort each chromosome bucket, verify it has no overlaps, and store it as an array in
   *       {@code chromosomeFeatures}.
   * </ol>
   *
   * @param gtf annotation file to read
   * @param type feature type to count; must equal {@code "exon"}
   * @throws UnsupportedOperationException if {@code type} is not {@code "exon"}
   * @throws IllegalStateException if an exon's gene id has no gene record, or if overlapping
   *     intervals remain after merging
   * @throws IOException declared by this method; presumably raised while reading the GTF
   */
  @Override
  public void buildFeatures(File gtf, String type) throws IOException {
    if (!type.equals("exon")) {
      throw new UnsupportedOperationException("only exon counting is supported");
    }

    // gene id -> exons belonging to that gene
    HashMap<String, LinkedList<Feature>> exonsByGene = new HashMap<>();

    logger.info("Reading annotation gtf...");
    int exonTotal = 0;
    int geneTotal = 0;
    int recordsSeen = 0;
    try (GTFReader reader = GTFReader.getGTFByFileNoEnsemblVersion(gtf)) {
      reader.addFilter(new GeneOrExonFilter()).addFilter(new RemoveRetainedIntronFilter());

      for (Feature record : reader) {
        recordsSeen++;
        // Periodic progress report.
        if (recordsSeen % 100000 == 0) {
          logger.info("Lines read: " + recordsSeen);
          logger.info("Last read feature: " + GTFFeatureRenderer.render(record));
        }

        if (record.type().equals("gene")) {
          geneInfo.put(record.getAttribute("gene_id"), record);
          geneTotal++;
          continue;
        }
        if (!record.type().equals("exon")) {
          continue;
        }

        exonTotal++;
        String geneId = record.getAttribute("gene_id");
        LinkedList<Feature> exons = exonsByGene.get(geneId);
        if (exons == null) {
          exons = new LinkedList<Feature>();
          exonsByGene.put(geneId, exons);
        }
        exons.add(record);
      }
    }

    logger.info("Total genes " + geneTotal);
    logger.info("Total exons " + exonTotal);

    // GTF sanity check: every gene id referenced by an exon needs its own gene record.
    logger.info("Validating that every exon has at least some gene entry from GTF file");
    for (String geneId : exonsByGene.keySet()) {
      if (!geneInfo.containsKey(geneId)) {
        throw new IllegalStateException(geneId + " does not have a gene entry in the GTF");
      }
    }
    logger.info("All gene ids found for exons");

    // Sort the exon list of each gene.
    logger.info("Sorting features per gene...");
    for (LinkedList<Feature> exons : exonsByGene.values()) {
      GTFFeatureUtil.sortFeatures(exons);
    }

    // Collect exons that overlap between pairs of overlapping genes.
    FindOverlappingExonsBetweenGenes exonOverlapFinder = new FindOverlappingExonsBetweenGenes();

    logger.info("Finding overlapping exons between genes");
    HashSet<Feature> sharedExons = new HashSet<>();
    FindOverlappingGenePairs genePairFinder = new FindOverlappingGenePairs(geneInfo.values());
    List<FeaturePair> genePairs = genePairFinder.getOverlappingGenes();
    for (FeaturePair genePair : genePairs) {
      List<Feature> firstExons = exonsByGene.get(genePair.getFirst().getAttribute("gene_id"));
      List<Feature> secondExons = exonsByGene.get(genePair.getSecond().getAttribute("gene_id"));
      // A gene may have no exon list at all (e.g. only retained introns), so skip such pairs.
      if (firstExons == null
          || firstExons.isEmpty()
          || secondExons == null
          || secondExons.isEmpty()) {
        continue;
      }
      List<Feature> overlapping = exonOverlapFinder.findOverlappingExons(firstExons, secondExons);
      if (!overlapping.isEmpty()) {
        sharedExons.addAll(overlapping);
      }
    }

    logger.info("Storing removed exons");
    featuresRemovedForAnalysis = sharedExons;
    logger.info("Total overlapping features between genes: " + sharedExons.size());

    // Drop the shared exons from every gene, counting how many list entries disappear.
    int removedTotal = 0;
    logger.info("Removing overlapping exons per gene");
    for (LinkedList<Feature> exons : exonsByGene.values()) {
      int sizeBefore = exons.size();
      exons.removeAll(sharedExons);
      removedTotal += sizeBefore - exons.size();
    }
    logger.info("Total overlapping features actually removed: " + removedTotal);

    HashMap<String, LinkedList<Feature>> featuresByChromosome = new HashMap<>();

    logger.info("Merging features across each gene and inserting into separate chromosome...");
    for (LinkedList<Feature> exons : exonsByGene.values()) {
      // Genes that lost all their exons contribute nothing.
      if (exons.isEmpty()) {
        continue;
      }

      List<Feature> merged = FeatureMerger.mergeOverlappingFeaturesIgnoringStrand(exons);

      // Register one FeatureCount per merged feature, keyed by the id the FeatureCount reports.
      for (Feature mergedFeature : merged) {
        FeatureCount counter = new FeatureCount(mergedFeature);
        featureCounts.put(counter.getId(), counter);
      }

      // The seqname of the first merged feature selects the bucket for the whole gene.
      String chromosome = merged.get(0).seqname();
      LinkedList<Feature> bucket = featuresByChromosome.get(chromosome);
      if (bucket == null) {
        featuresByChromosome.put(chromosome, new LinkedList<>(merged));
      } else {
        bucket.addAll(merged);
      }
    }

    logger.info("Sorting features per chromosome...");
    for (LinkedList<Feature> bucket : featuresByChromosome.values()) {
      GTFFeatureUtil.sortFeatures(bucket);
    }

    logger.info("Validating that there are no overlapping intervals");

    for (Map.Entry<String, LinkedList<Feature>> bucket : featuresByChromosome.entrySet()) {
      logger.info("Validating that " + bucket.getKey() + " has no overlap");
      boolean overlapFree = GTFFeatureUtil.hasNoOverlapIgnoreStrand(bucket.getValue());
      if (!overlapFree) {
        throw new IllegalStateException(
            "Some intervals have overlap... There was a problem in the merging process");
      }
    }

    logger.info("Converting chomosome features to array for binary search...");
    // Arrays are stored per chromosome so they can be binary-searched later.
    for (Map.Entry<String, LinkedList<Feature>> bucket : featuresByChromosome.entrySet()) {
      LinkedList<Feature> sorted = bucket.getValue();
      chromosomeFeatures.put(bucket.getKey(), sorted.toArray(new Feature[sorted.size()]));
    }
  }