static double addToFeatures( int totalMappedBases, Map<Feature, Integer> features, Map<String, FeatureCount> featureCounts) { assert (features != null); assert (featureCounts != null); if (features.size() == 0) { return 1.0; } /* * Count the total mapping * just going to use total mapped bases */ // double total = 0.0; // for(Map.Entry<Feature, Integer> entry : features.entrySet()) { // total += entry.getValue(); // } double mapped = 0.0; for (Map.Entry<Feature, Integer> entry : features.entrySet()) { Feature feature = entry.getKey(); double count = entry.getValue(); mapped += count; String id = feature.getAttribute("id"); if (id == null) { throw new IllegalArgumentException( "Feature must have id attribute\n" + GTFFeatureRenderer.render(feature)); } FeatureCount fc = featureCounts.get(id); if (fc == null) { throw new IllegalStateException("No FeatureCount object found for id: " + id); } fc.addToCount(count / (double) totalMappedBases); } double result = ((double) totalMappedBases - mapped) / (double) totalMappedBases; if (result < -0.0001) { throw new IllegalStateException( "There was an issue counting reads...\nresult=" + result + "\ntotalMappedBases=" + totalMappedBases + "\nmapped=" + mapped); } return result; }
@Override public void buildFeatures(File gtf, String type) throws IOException { // gene -> LinkedList<Feature> HashMap<String, LinkedList<Feature>> geneFeatures = new HashMap<>(); if (!type.equals("exon")) { throw new UnsupportedOperationException("only exon counting is supported"); } logger.info("Reading annotation gtf..."); // Read GTF // Store all exons for each gene int totalFeatures = 0; int totalGenes = 0; int linesRead = 0; try (GTFReader reader = GTFReader.getGTFByFileNoEnsemblVersion(gtf)) { reader.addFilter(new GeneOrExonFilter()).addFilter(new RemoveRetainedIntronFilter()); for (Feature feature : reader) { linesRead++; if (linesRead % 100000 == 0) { logger.info("Lines read: " + linesRead); logger.info("Last read feature: " + GTFFeatureRenderer.render(feature)); } if (feature.type().equals("gene")) { geneInfo.put(feature.getAttribute("gene_id"), feature); totalGenes++; } else if (feature.type().equals("exon")) { totalFeatures++; String geneId = feature.getAttribute("gene_id"); if (geneFeatures.containsKey(geneId)) { geneFeatures.get(geneId).add(feature); } else { LinkedList<Feature> features = new LinkedList<Feature>(); features.add(feature); geneFeatures.put(geneId, features); } } } } logger.info("Total genes " + totalGenes); logger.info("Total exons " + totalFeatures); /* * Checking GTF Validity */ logger.info("Validating that every exon has at least some gene entry from GTF file"); for (String key : geneFeatures.keySet()) { if (!geneInfo.containsKey(key)) { throw new IllegalStateException(key + " does not have a gene entry in the GTF"); } } logger.info("All gene ids found for exons"); /* * sorting exons per gene */ logger.info("Sorting features per gene..."); // Sort features per chromosome for (Map.Entry<String, LinkedList<Feature>> entry : geneFeatures.entrySet()) { GTFFeatureUtil.sortFeatures(entry.getValue()); } /* * find overlapping features between genes */ ArrayList<String> geneKeys = new ArrayList<String>(geneFeatures.keySet().size()); geneKeys.addAll(geneFeatures.keySet()); FindOverlappingExonsBetweenGenes findOverlappingExonsBetweenGenes = new FindOverlappingExonsBetweenGenes(); logger.info("Finding overlapping exons between genes"); HashSet<Feature> geneOverlapToRemove = new HashSet<>(); FindOverlappingGenePairs findOverlappingGenePairs = new FindOverlappingGenePairs(geneInfo.values()); List<FeaturePair> overlappingGenes = findOverlappingGenePairs.getOverlappingGenes(); for (FeaturePair pair : overlappingGenes) { // logger.info("First of pair: " + pair.getFirst().getAttribute("gene_id")); // logger.info("Second of pair: " + pair.getSecond().getAttribute("gene_id")); List<Feature> gene1 = geneFeatures.get(pair.getFirst().getAttribute("gene_id")); List<Feature> gene2 = geneFeatures.get(pair.getSecond().getAttribute("gene_id")); // Some genes may have only retained introns... if (gene1 != null && gene1.size() > 0 && gene2 != null && gene2.size() > 0) { List<Feature> overlappingExons = findOverlappingExonsBetweenGenes.findOverlappingExons(gene1, gene2); if (overlappingExons.size() > 0) { geneOverlapToRemove.addAll(overlappingExons); } } } logger.info("Storing removed exons"); featuresRemovedForAnalysis = geneOverlapToRemove; logger.info("Total overlapping features between genes: " + geneOverlapToRemove.size()); /* * remove overlapping features between genes */ int countRemovedFeatures = 0; logger.info("Removing overlapping exons per gene"); for (String geneKey : geneKeys) { List<Feature> features = geneFeatures.get(geneKey); // logger.info("GENE " + geneKey + " has " + features.size()); Iterator<Feature> iter = features.iterator(); while (iter.hasNext()) { Feature f = iter.next(); if (geneOverlapToRemove.contains(f)) { iter.remove(); countRemovedFeatures++; } } // logger.info("GENE " + geneKey + " has " + features.size() + " after removing overlapping // exons"); } logger.info("Total overlapping features actually removed: " + countRemovedFeatures); HashMap<String, LinkedList<Feature>> chromosomeFeaturesUnsorted = new HashMap<>(); logger.info("Merging features across each gene and inserting into separate chromosome..."); // Merge features per gene and store for (Map.Entry<String, LinkedList<Feature>> entry : geneFeatures.entrySet()) { if (entry.getValue().size() == 0) { continue; } // GTFFeatureUtil.sortFeatures(entry.getValue()); List<Feature> mergedFeatures = FeatureMerger.mergeOverlappingFeaturesIgnoringStrand(entry.getValue()); /* * create feature counts and set ids on features */ for (Feature feature : mergedFeatures) { FeatureCount fc = new FeatureCount(feature); String id = fc.getId(); featureCounts.put(id, fc); } Feature first = mergedFeatures.get(0); String chr = first.seqname(); /* * For stranded data add + or - to end of chr */ if (chromosomeFeaturesUnsorted.containsKey(chr)) { chromosomeFeaturesUnsorted.get(chr).addAll(mergedFeatures); } else { LinkedList<Feature> chrFeatures = new LinkedList<>(); chrFeatures.addAll(mergedFeatures); chromosomeFeaturesUnsorted.put(chr, chrFeatures); } } logger.info("Sorting features per chromosome..."); // Sort features per chromosome for (Map.Entry<String, LinkedList<Feature>> entry : chromosomeFeaturesUnsorted.entrySet()) { GTFFeatureUtil.sortFeatures(entry.getValue()); } logger.info("Validating that there are no overlapping intervals"); for (Map.Entry<String, LinkedList<Feature>> entry : chromosomeFeaturesUnsorted.entrySet()) { logger.info("Validating that " + entry.getKey() + " has no overlap"); if (!GTFFeatureUtil.hasNoOverlapIgnoreStrand(entry.getValue())) { throw new IllegalStateException( "Some intervals have overlap... There was a problem in the merging process"); } } logger.info("Converting chomosome features to array for binary search..."); // Store as array per chromosome for binary search for (Map.Entry<String, LinkedList<Feature>> entry : chromosomeFeaturesUnsorted.entrySet()) { Feature[] featureArray = new Feature[entry.getValue().size()]; chromosomeFeatures.put(entry.getKey(), entry.getValue().toArray(featureArray)); } }