/** * Retrieve the weight for this distribution. * * <p>Performs the standard munge to handle ambiguity symbols. The actual weights for each atomic * symbol should be calculated by the getWeightImpl functions. * * @param sym the Symbol to find the probability of * @return the probability that one of the symbols matching amb was emitted * @throws IllegalSymbolException if for any reason the symbols within amb are not recognized by * this state */ public final double getWeight(Symbol sym) throws IllegalSymbolException { if (sym instanceof AtomicSymbol) { return getWeightImpl((AtomicSymbol) sym); } else { Alphabet ambA = sym.getMatches(); if (((FiniteAlphabet) ambA).size() == 0) { // a gap getAlphabet().validate(sym); double totalWeight = 0.0; for (Iterator i = ((FiniteAlphabet) getAlphabet()).iterator(); i.hasNext(); ) { Symbol s = (Symbol) i.next(); totalWeight += getWeight(s); } return 1.0 - totalWeight; } if (ambA instanceof FiniteAlphabet) { FiniteAlphabet fa = (FiniteAlphabet) ambA; double sum = 0.0; for (Iterator i = fa.iterator(); i.hasNext(); ) { Object obj = i.next(); if (!(obj instanceof AtomicSymbol)) { throw new BioError("Assertion Failure: Not an instance of AtomicSymbol: " + obj); } AtomicSymbol as = (AtomicSymbol) obj; sum += getWeightImpl(as); } return sum; } else { throw new IllegalSymbolException( "Can't find weight for infinite set of symbols matched by " + sym.getName()); } } }
private void doSetWeight(Symbol sym, double weight) throws IllegalSymbolException, ChangeVetoException { if (sym instanceof AtomicSymbol) { setWeightImpl((AtomicSymbol) sym, weight); } else { // need to divide the weight up amongst the atomic symbols according // to the null model FiniteAlphabet fa = (FiniteAlphabet) sym.getMatches(); double totalNullWeight = this.getNullModel().getWeight(sym); for (Iterator si = fa.iterator(); si.hasNext(); ) { AtomicSymbol as = (AtomicSymbol) si.next(); double symNullWeight = this.getNullModel().getWeight(as); setWeightImpl(as, weight * symNullWeight / totalNullWeight); } } }
/** * <code>makeSubHit</code> creates a new sub-hit. * * @return a <code>SeqSimilaritySearchSubHit</code>. * @exception BioException if an error occurs. */ private SeqSimilaritySearchSubHit makeSubHit() throws BioException { // Try to get a valid TokenParser if (tokenParser == null) { String identifier; // Try explicit sequence type first if (subHitData.containsKey("subjectSequenceType")) identifier = (String) subHitData.get("subjectSequenceType"); // Otherwise try to resolve from the program name (only // works for Blast) else if (resultPreAnnotation.containsKey("program")) identifier = (String) resultPreAnnotation.get("program"); else throw new BioException("Failed to determine sequence type"); FiniteAlphabet alpha = AlphabetResolver.resolveAlphabet(identifier); tokenParser = alpha.getTokenization("token"); } // BLASTP output has the strands set null (protein sequences) Strand qStrand = null; Strand sStrand = null; // Override where an explicit strand is given (FASTA DNA, // BLASTN) if (subHitData.containsKey("queryStrand")) if (subHitData.get("queryStrand").equals("plus")) qStrand = StrandedFeature.POSITIVE; else qStrand = StrandedFeature.NEGATIVE; if (subHitData.containsKey("subjectStrand")) if (subHitData.get("subjectStrand").equals("plus")) sStrand = StrandedFeature.POSITIVE; else sStrand = StrandedFeature.NEGATIVE; // Override where a frame is given as this contains strand // information (BLASTX for query, TBLASTN for hit, TBLASTX for // both) if (subHitData.containsKey("queryFrame")) if (((String) subHitData.get("queryFrame")).startsWith("plus")) qStrand = StrandedFeature.POSITIVE; else qStrand = StrandedFeature.NEGATIVE; if (subHitData.containsKey("subjectFrame")) if (((String) subHitData.get("subjectFrame")).startsWith("plus")) sStrand = StrandedFeature.POSITIVE; else sStrand = StrandedFeature.NEGATIVE; // Get start/end int qStart = Integer.parseInt((String) subHitData.get("querySequenceStart")); int qEnd = Integer.parseInt((String) subHitData.get("querySequenceEnd")); int sStart = Integer.parseInt((String) subHitData.get("subjectSequenceStart")); int sEnd = Integer.parseInt((String) subHitData.get("subjectSequenceEnd")); // The start/end coordinates from BioJava XML don't follow the // BioJava paradigm of start < end, with orientation given by // the strand property. Rather, they present start/end as // displayed in BLAST output, with the coordinates being // inverted on the reverse strand. We account for this here. if (qStrand == StrandedFeature.NEGATIVE) { int swap = qStart; qStart = qEnd; qEnd = swap; } if (sStrand == StrandedFeature.NEGATIVE) { int swap = sStart; sStart = sEnd; sEnd = swap; } // Get scores double sc = Double.NaN; double ev = Double.NaN; double pv = Double.NaN; if (subHitData.containsKey("score")) sc = Double.parseDouble((String) subHitData.get("score")); if (subHitData.containsKey("expectValue")) { String val = (String) subHitData.get("expectValue"); // Blast sometimes uses invalid formatting such as 'e-156' // rather than '1e-156' if (val.startsWith("e")) ev = Double.parseDouble("1" + val); else ev = Double.parseDouble(val); } if (subHitData.containsKey("pValue")) pv = Double.parseDouble((String) subHitData.get("pValue")); Map labelMap = new SmallMap(); // Note that the following is removing the raw sequences StringBuffer tokenBuffer = new StringBuffer(1024); tokenBuffer.append((String) subHitData.remove("querySequence")); labelMap.put( SeqSimilaritySearchSubHit.QUERY_LABEL, new SimpleSymbolList(tokenParser, tokenBuffer.substring(0))); tokenBuffer = new StringBuffer(1024); tokenBuffer.append((String) subHitData.remove("subjectSequence")); labelMap.put( hitData.get("subjectId"), new SimpleSymbolList(tokenParser, tokenBuffer.substring(0))); return new SimpleSeqSimilaritySearchSubHit( sc, ev, pv, qStart, qEnd, qStrand, sStart, sEnd, sStrand, new SimpleAlignment(labelMap), AnnotationFactory.makeAnnotation(subHitData)); }