@Override public boolean run() { for (Gene g : config.getSnpEffectPredictor().getGenome().getGenes()) { // System.out.println(g.getGeneName()); for (Transcript tr : g) { if (!tr.isProteinCoding()) continue; if (tr.introns().size() < 2) continue; // System.out.println("\t" + tr.getId()); for (Intron i : tr.introns()) { int pos = i.getStart() + (int) (Math.random() * (i.size() - 2)) + 1; String line = i.getChromosomeName() + "\t" + pos + "\t.\tA\tT\t.\t.\tAC=1;GENE=" + g.getGeneName() + ";TR=" + tr.getId() + ";INTRON=" + i.getRank(); System.out.println(line); sb.append(line + "\n"); } } } Gpr.toFile(Gpr.HOME + "/introns_test.vcf", sb); return true; }
/** Show a genome in a 'standard' way */ String show(Genome genome) { StringBuilder sb = new StringBuilder(); // Genome sb.append(genome.getVersion() + "\n"); // Chromosomes for (Chromosome chr : genome) sb.append(chr + "\n"); // Genes ArrayList<Gene> genes = new ArrayList<Gene>(); // Sort genes for (Gene gene : genome.getGenes()) genes.add(gene); Collections.sort(genes); // Show genes for (Gene gene : genes) { sb.append(gene); for (Transcript tr : gene.sortedStrand()) sb.append("\t\tCDS '" + tr.getId() + "': " + tr.cds() + "\n"); } return sb.toString(); }
/** * Add into to a hash * * @param hits * @param marker * @param hit2add * @param showGeneDetails * @param compareTemplate */ void regionsAddHit( HashSet<String> hits, Marker hit2add, Marker marker, boolean showGeneDetails, boolean compareTemplate) { String hitStr = hit2add.getClass().getSimpleName(); if (compareTemplate) { Gene gene = (Gene) hit2add.findParent(Gene.class); if (gene != null) hitStr += (hit2add.isStrandPlus() == marker.isStrandPlus()) ? "_TEMPLATE_STRAND" : "_NON_TEMPLATE_STRAND"; } if (showGeneDetails && (hit2add instanceof Gene)) { Gene gene = (Gene) hit2add; hitStr += "[" + gene.getBioType() + ", " + gene.getGeneName() + ", " + (gene.isProteinCoding() ? "protein" : "not-protein") + "]"; } hits.add(hitStr); // Add marker name to the list }
/** * Return a collection of intervals that intersect 'marker' Query resulting genes, transcripts and * exons to get ALL types of intervals possible * * @return */ public Markers queryDeep(Marker marker) { if (Config.get().isErrorOnMissingChromo() && isChromosomeMissing(marker)) throw new RuntimeEOFException("Chromosome missing for marker: " + marker); boolean hitChromo = false; Markers hits = new Markers(); Markers intersects = query(marker); if (intersects.size() > 0) { for (Marker m : intersects) { hits.add(m); if (m instanceof Chromosome) { hitChromo = true; // OK (we have to hit a chromosome, otherwise it's an error } else if (m instanceof Gene) { // Analyze Genes Gene gene = (Gene) m; hits.addAll(gene.query(marker)); } } } if (!hitChromo && Config.get().isErrorChromoHit()) throw new RuntimeException("ERROR: Out of chromosome range. " + marker); return hits; }
/**
 * Get biotype.
 *
 * <p>Resolution order: empty string if there is no gene; the transcript's biotype if there is a
 * transcript; otherwise "coding" / "non-coding" (according to the gene) if the gene's genome has
 * coding information; otherwise an empty string.
 *
 * @return The biotype, or an empty string if it cannot be determined
 */
public String getBiotype() {
  Gene gene = getGene();
  if (gene == null) return "";

  // Transcript's biotype takes precedence
  Transcript transcript = getTranscript();
  if (transcript != null) return transcript.getBioType();

  // No transcript: fall back to the gene's coding information, if the genome has any
  if (!gene.getGenome().hasCodingInfo()) return "";
  return gene.isProteinCoding() ? "coding" : "non-coding";
}
/** Count bases covered for each marker type */ public void countBases() { // --- // Add all markers // --- Markers markers = new Markers(); markers.add(snpEffectPredictor.getMarkers()); for (Gene gene : snpEffectPredictor.getGenome().getGenes()) { markers.add(gene); markers.add(gene.markers()); } for (Chromosome chr : snpEffectPredictor.getGenome()) markers.add(chr); // --- // Calculate raw counts // --- for (Marker m : markers) { String mtype = markerTypes.getType(m); String msubtype = markerTypes.getSubType(m); rawCountMarkers.inc(mtype); rawCountBases.inc(mtype, m.size()); // Count sub-types (if any) if (msubtype != null) { rawCountMarkers.inc(msubtype); rawCountBases.inc(msubtype, m.size()); } } // --- // Count number of bases for each marker type (overlap and join) // --- for (String mtype : rawCountMarkers.keysSorted()) { if (mtype.equals(Chromosome.class.getSimpleName())) continue; // We calculate chromosomes later (it's faster) if (verbose) System.err.print(mtype + ":"); if (countMarkers.get(mtype) == 0) { for (Chromosome chr : snpEffectPredictor.getGenome()) countBases(mtype, chr, markers); } if (verbose) System.err.println(""); } // Show chromosomes length String mtype = Chromosome.class.getSimpleName(); for (Chromosome chr : snpEffectPredictor.getGenome()) { countBases.inc(mtype, chr.size()); countMarkers.inc(mtype); } }
public void testCase_05_PaeruPA14muccA() { String genome = "paeru.PA14"; String gff3File = "tests/paeru.PA14.muccA.gff"; String resultFile = "tests/paeru.PA14.muccA.txt"; SnpEffectPredictor sep = buildAndCompare(genome, gff3File, resultFile, true, false); // Make sure no splice site is added Gene gene = sep.getGenome().getGenes().iterator().next(); Transcript tr = gene.iterator().next(); List<SpliceSite> spliceSites = tr.createSpliceSites(SpliceSite.CORE_SPLICE_SITE_SIZE, 0, 0, 0); Assert.assertEquals(0, spliceSites.size()); }
/** * Find closest gene to this marker * * <p>In case more than one 'closest' gene is found (e.g. two or more genes at the same distance). * The following rules apply: * * <p>i) If many genes have the same 'closest distance', coding genes are preferred. * * <p>ii) If more than one coding gene has the same 'closet distance', a random gene is returned. * * @param inputInterval */ public Gene queryClosestGene(Marker inputInterval) { int initialExtension = 1000; String chrName = inputInterval.getChromosomeName(); Chromosome chr = genome.getChromosome(chrName); if (chr == null) return null; if (chr.size() > 0) { // Extend interval to capture 'close' genes for (int extend = initialExtension; extend < chr.size(); extend *= 2) { int start = Math.max(inputInterval.getStart() - extend, 0); int end = inputInterval.getEnd() + extend; Marker extended = new Marker(chr, start, end, 1, ""); // Find all genes that intersect with the interval Markers markers = query(extended); Markers genes = new Markers(); int minDist = Integer.MAX_VALUE; for (Marker m : markers) { if (m instanceof Gene) { int dist = m.distance(inputInterval); if (dist < minDist) { genes.add(m); minDist = dist; } } } // Found something? if (genes.size() > 0) { // Find a gene having distance 'minDist'. Prefer coding genes Gene minDistGene = null; for (Marker m : genes) { int dist = m.distance(inputInterval); if (dist == minDist) { Gene gene = (Gene) m; if (minDistGene == null) minDistGene = gene; else if (!minDistGene.isProteinCoding() && gene.isProteinCoding()) minDistGene = gene; } } return minDistGene; } } } // Nothing found return null; }
/** * What percentile of the transcripts in this gene are affected? * * @param gene * @return */ double percentOfTranscriptsAffected(Gene gene, HashSet<Transcript> transcripts) { if (gene == null) return 0; // Count how many transcript are affected in each gene int countAffected = 0; for (Transcript tr : gene) if (transcripts.contains(tr)) countAffected++; return countAffected / ((double) gene.numChilds()); }
public String toString(boolean useSeqOntology, boolean useHgvs) { // Get data to show String geneId = "", geneName = "", bioType = "", transcriptId = "", exonId = "", customId = ""; int exonRank = -1; if (marker != null) { // Gene Id, name and biotype Gene gene = getGene(); Transcript tr = getTranscript(); // CDS size info if (gene != null) { geneId = gene.getId(); geneName = gene.getGeneName(); bioType = getBiotype(); } // Update trId if (tr != null) transcriptId = tr.getId(); // Exon rank information Exon exon = getExon(); if (exon != null) { exonId = exon.getId(); exonRank = exon.getRank(); } // Regulation if (isRegulation()) bioType = ((Regulation) marker).getCellType(); } // Add seqChage's ID if (!variant.getId().isEmpty()) customId += variant.getId(); // Add custom markers if ((marker != null) && (marker instanceof Custom)) customId += (customId.isEmpty() ? "" : ";") + marker.getId(); // CDS length int cdsSize = getCdsLength(); String errWarn = error + (error.isEmpty() ? "" : "|") + warning; String aaChange = ""; if (useHgvs) aaChange = getHgvs(); else aaChange = ((aaRef.length() + aaAlt.length()) > 0 ? aaRef + "/" + aaAlt : ""); return errWarn // + "\t" + geneId // + "\t" + geneName // + "\t" + bioType // + "\t" + transcriptId // + "\t" + exonId // + "\t" + (exonRank >= 0 ? exonRank : "") // + "\t" + effect(false, false, false, useSeqOntology) // + "\t" + aaChange // + "\t" + ((codonsRef.length() + codonsAlt.length()) > 0 ? codonsRef + "/" + codonsAlt : "") // + "\t" + (codonNum >= 0 ? (codonNum + 1) : "") // + "\t" + (codonDegeneracy >= 0 ? codonDegeneracy + "" : "") // + "\t" + (cdsSize >= 0 ? cdsSize : "") // + "\t" + (codonsAroundOld.length() > 0 ? codonsAroundOld + " / " + codonsAroundNew : "") // + "\t" + (aasAroundOld.length() > 0 ? aasAroundOld + " / " + aasAroundNew : "") // + "\t" + customId // ; }
/** * Is this single change a LOF? * * <p>Criteria: 1) Core splice sites acceptors or donors (only CORE ones) 2) Stop gained (if this * happens at the last part of the protein, we assume it has no effect) 3) Frame shifts * * @param changeEffect * @return */ protected boolean isLof(ChangeEffect changeEffect) { // Not a sequence change? => Not LOF if ((changeEffect.getSeqChange() != null) && (!changeEffect.getSeqChange().isVariant())) return false; // Is this change affecting a protein coding gene? Gene gene = changeEffect.getGene(); Transcript tr = changeEffect.getTranscript(); if ((gene == null) // No gene affected? || (tr == null) // No transcript affected? || (!gene.isProteinCoding() && !config.isTreatAllAsProteinCoding()) // Not a protein coding gene? || (!tr.isProteinCoding() && !config.isTreatAllAsProteinCoding()) // Not a protein coding transcript? ) return false; // --- // Is this variant a LOF? // --- boolean lof = false; // Frame shifts if (changeEffect.getEffectType() == EffectType.FRAME_SHIFT) { // It is assumed that even with a protein coding change at the last 5% of the protein, the // protein could still be functional. double perc = percentCds(changeEffect); lof |= (ignoreProteinCodingBefore <= perc) && (perc <= ignoreProteinCodingAfter); } // Deletion? Is another method to check if (changeEffect.getSeqChange().isDel()) lof |= isLofDeletion(changeEffect); // The following effect types can be considered LOF switch (changeEffect.getEffectType()) { case SPLICE_SITE_ACCEPTOR: case SPLICE_SITE_DONOR: // Core splice sites are considered LOF if ((changeEffect.getMarker() != null) && (changeEffect.getMarker() instanceof SpliceSite)) { // Get splice site marker and check if it is 'core' SpliceSite spliceSite = (SpliceSite) changeEffect.getMarker(); if (spliceSite.intersectsCoreSpliceSite(changeEffect.getSeqChange())) lof = true; // Does it intersect the CORE splice site? 
} break; case STOP_GAINED: lof |= isNmd(changeEffect); break; case RARE_AMINO_ACID: case START_LOST: // This one is not in the referenced papers, but we assume that RARE AA and START_LOSS // changes are damaging. lof = true; break; default: // All others are not considered LOF break; } // Update sets if (lof) { transcriptsLof.add( changeEffect .getTranscript()); // Unique transcripts affected (WARNING: null will be added) genesLof.add(changeEffect.getGene()); // Unique genes affected (WARNING: null will be added) } return lof; }
/**
 * Keep only the transcripts whose IDs are in the set (i.e. remove all transcripts that are NOT
 * in the set). Delegates to {@code keepTranscripts(trIds)} on every gene in the genome.
 *
 * @param trIds IDs of the transcripts to keep
 * @return Sum of the values returned by each gene (number of transcripts removed)
 */
public int retainAllTranscripts(Set<String> trIds) {
  int removed = 0;

  for (Gene gene : genome.getGenes()) {
    removed += gene.keepTranscripts(trIds);
  }

  return removed;
}
/**
 * Remove all non-canonical transcripts, by invoking {@code removeNonCanonical()} on every gene
 * in the genome.
 */
public void removeNonCanonical() {
  for (Gene gene : genome.getGenes()) {
    gene.removeNonCanonical();
  }
}
/** * Name of the regions hit by a marker * * @param marker * @param showGeneDetails * @param compareTemplate * @param id : Only use genes or transcripts matching this ID * @return */ public Set<String> regions( Marker marker, boolean showGeneDetails, boolean compareTemplate, String id) { if (Config.get().isErrorOnMissingChromo() && isChromosomeMissing(marker)) throw new RuntimeEOFException("Chromosome missing for marker: " + marker); boolean hitChromo = false; HashSet<String> hits = new HashSet<String>(); Markers intersects = query(marker); if (intersects.size() > 0) { for (Marker markerInt : intersects) { if (markerInt instanceof Chromosome) { hitChromo = true; // OK (we have to hit a chromosome, otherwise it's an error hits.add(markerInt.getClass().getSimpleName()); // Add marker name to the list } else if (markerInt instanceof Gene) { // Analyze Genes Gene gene = (Gene) markerInt; regionsAddHit(hits, gene, marker, showGeneDetails, compareTemplate); // For all transcripts... for (Transcript tr : gene) { if ((id == null) || gene.getId().equals(id) || tr.getId().equals(id)) { // Mathes ID? (...or no ID to match) // Does it intersect this transcript? if (tr.intersects(marker)) { regionsAddHit(hits, tr, marker, showGeneDetails, compareTemplate); // Does it intersect a UTR? for (Utr utr : tr.getUtrs()) if (utr.intersects(marker)) regionsAddHit(hits, utr, marker, showGeneDetails, compareTemplate); // Does it intersect an exon? for (Exon ex : tr) if (ex.intersects(marker)) regionsAddHit(hits, ex, marker, showGeneDetails, compareTemplate); // Does it intersect an intron? for (Intron intron : tr.introns()) if (intron.intersects(marker)) regionsAddHit(hits, intron, marker, showGeneDetails, compareTemplate); } } } } else { // No ID to match? if (id == null) regionsAddHit(hits, markerInt, marker, showGeneDetails, compareTemplate); else { // Is ID from transcript? 
Transcript tr = (Transcript) markerInt.findParent(Transcript.class); if ((tr != null) && (tr.getId().equals(id))) { regionsAddHit( hits, markerInt, marker, showGeneDetails, compareTemplate); // Transcript ID matches => count } else { // Is ID from gene? Gene gene = (Gene) markerInt.findParent(Gene.class); if ((gene != null) && (gene.getId().equals(id))) regionsAddHit( hits, markerInt, marker, showGeneDetails, compareTemplate); // Gene ID matches => count } } } } } if (!hitChromo) throw new RuntimeException("ERROR: Out of chromosome range. " + marker); return hits; }