static Map<Feature, Integer> getMappedRegionsForMate( SAMRecord mate, Feature[] features, SAMRecordToFeatureConverter converter, FindOverlappingFeatures findOverlappingFeatures) { List<Feature> featuresForRecord = converter.convert(mate); HashMap<Feature, Integer> featuresForMate = new HashMap<>(); // boolean isAmbiguous = false; for (Feature target : featuresForRecord) { // logger.info(GTFFeatureRenderer.render(target)); // we can assume the result is non overlapping List<Feature> mappedFeatures = findOverlappingFeatures.findOverlappingFeatures(features, target); // logger.info("Number of mapped features: " + mappedFeatures.size()); assert (GTFFeatureUtil.hasNoOverlapIgnoreStrand(mappedFeatures)); for (Feature mapped : mappedFeatures) { // logger.info(GTFFeatureRenderer.render(mapped)); if (target.location().plus().overlaps(mapped.location().plus())) { int overlap = target.location().plus().intersection(mapped.location().plus()).length(); // logger.info("overlap: " + overlap); if (featuresForMate.containsKey(mapped)) { int currentVal = featuresForMate.get(mapped); featuresForMate.put(mapped, overlap + currentVal); } else { featuresForMate.put(mapped, overlap); } } } } return featuresForMate; }
@Override public void buildFeatures(File gtf, String type) throws IOException { // gene -> LinkedList<Feature> HashMap<String, LinkedList<Feature>> geneFeatures = new HashMap<>(); if (!type.equals("exon")) { throw new UnsupportedOperationException("only exon counting is supported"); } logger.info("Reading annotation gtf..."); // Read GTF // Store all exons for each gene int totalFeatures = 0; int totalGenes = 0; int linesRead = 0; try (GTFReader reader = GTFReader.getGTFByFileNoEnsemblVersion(gtf)) { reader.addFilter(new GeneOrExonFilter()).addFilter(new RemoveRetainedIntronFilter()); for (Feature feature : reader) { linesRead++; if (linesRead % 100000 == 0) { logger.info("Lines read: " + linesRead); logger.info("Last read feature: " + GTFFeatureRenderer.render(feature)); } if (feature.type().equals("gene")) { geneInfo.put(feature.getAttribute("gene_id"), feature); totalGenes++; } else if (feature.type().equals("exon")) { totalFeatures++; String geneId = feature.getAttribute("gene_id"); if (geneFeatures.containsKey(geneId)) { geneFeatures.get(geneId).add(feature); } else { LinkedList<Feature> features = new LinkedList<Feature>(); features.add(feature); geneFeatures.put(geneId, features); } } } } logger.info("Total genes " + totalGenes); logger.info("Total exons " + totalFeatures); /* * Checking GTF Validity */ logger.info("Validating that every exon has at least some gene entry from GTF file"); for (String key : geneFeatures.keySet()) { if (!geneInfo.containsKey(key)) { throw new IllegalStateException(key + " does not have a gene entry in the GTF"); } } logger.info("All gene ids found for exons"); /* * sorting exons per gene */ logger.info("Sorting features per gene..."); // Sort features per chromosome for (Map.Entry<String, LinkedList<Feature>> entry : geneFeatures.entrySet()) { GTFFeatureUtil.sortFeatures(entry.getValue()); } /* * find overlapping features between genes */ ArrayList<String> geneKeys = new ArrayList<String>(geneFeatures.keySet().size()); geneKeys.addAll(geneFeatures.keySet()); FindOverlappingExonsBetweenGenes findOverlappingExonsBetweenGenes = new FindOverlappingExonsBetweenGenes(); logger.info("Finding overlapping exons between genes"); HashSet<Feature> geneOverlapToRemove = new HashSet<>(); FindOverlappingGenePairs findOverlappingGenePairs = new FindOverlappingGenePairs(geneInfo.values()); List<FeaturePair> overlappingGenes = findOverlappingGenePairs.getOverlappingGenes(); for (FeaturePair pair : overlappingGenes) { // logger.info("First of pair: " + pair.getFirst().getAttribute("gene_id")); // logger.info("Second of pair: " + pair.getSecond().getAttribute("gene_id")); List<Feature> gene1 = geneFeatures.get(pair.getFirst().getAttribute("gene_id")); List<Feature> gene2 = geneFeatures.get(pair.getSecond().getAttribute("gene_id")); // Some genes may have only retained introns... if (gene1 != null && gene1.size() > 0 && gene2 != null && gene2.size() > 0) { List<Feature> overlappingExons = findOverlappingExonsBetweenGenes.findOverlappingExons(gene1, gene2); if (overlappingExons.size() > 0) { geneOverlapToRemove.addAll(overlappingExons); } } } logger.info("Storing removed exons"); featuresRemovedForAnalysis = geneOverlapToRemove; logger.info("Total overlapping features between genes: " + geneOverlapToRemove.size()); /* * remove overlapping features between genes */ int countRemovedFeatures = 0; logger.info("Removing overlapping exons per gene"); for (String geneKey : geneKeys) { List<Feature> features = geneFeatures.get(geneKey); // logger.info("GENE " + geneKey + " has " + features.size()); Iterator<Feature> iter = features.iterator(); while (iter.hasNext()) { Feature f = iter.next(); if (geneOverlapToRemove.contains(f)) { iter.remove(); countRemovedFeatures++; } } // logger.info("GENE " + geneKey + " has " + features.size() + " after removing overlapping // exons"); } logger.info("Total overlapping features actually removed: " + countRemovedFeatures); HashMap<String, LinkedList<Feature>> chromosomeFeaturesUnsorted = new HashMap<>(); logger.info("Merging features across each gene and inserting into separate chromosome..."); // Merge features per gene and store for (Map.Entry<String, LinkedList<Feature>> entry : geneFeatures.entrySet()) { if (entry.getValue().size() == 0) { continue; } // GTFFeatureUtil.sortFeatures(entry.getValue()); List<Feature> mergedFeatures = FeatureMerger.mergeOverlappingFeaturesIgnoringStrand(entry.getValue()); /* * create feature counts and set ids on features */ for (Feature feature : mergedFeatures) { FeatureCount fc = new FeatureCount(feature); String id = fc.getId(); featureCounts.put(id, fc); } Feature first = mergedFeatures.get(0); String chr = first.seqname(); /* * For stranded data add + or - to end of chr */ if (chromosomeFeaturesUnsorted.containsKey(chr)) { chromosomeFeaturesUnsorted.get(chr).addAll(mergedFeatures); } else { LinkedList<Feature> chrFeatures = new LinkedList<>(); chrFeatures.addAll(mergedFeatures); chromosomeFeaturesUnsorted.put(chr, chrFeatures); } } logger.info("Sorting features per chromosome..."); // Sort features per chromosome for (Map.Entry<String, LinkedList<Feature>> entry : chromosomeFeaturesUnsorted.entrySet()) { GTFFeatureUtil.sortFeatures(entry.getValue()); } logger.info("Validating that there are no overlapping intervals"); for (Map.Entry<String, LinkedList<Feature>> entry : chromosomeFeaturesUnsorted.entrySet()) { logger.info("Validating that " + entry.getKey() + " has no overlap"); if (!GTFFeatureUtil.hasNoOverlapIgnoreStrand(entry.getValue())) { throw new IllegalStateException( "Some intervals have overlap... There was a problem in the merging process"); } } logger.info("Converting chomosome features to array for binary search..."); // Store as array per chromosome for binary search for (Map.Entry<String, LinkedList<Feature>> entry : chromosomeFeaturesUnsorted.entrySet()) { Feature[] featureArray = new Feature[entry.getValue().size()]; chromosomeFeatures.put(entry.getKey(), entry.getValue().toArray(featureArray)); } }