/**
 * Works out the location to export for a feature, joining the ranges of all UTRs of the same
 * kind that belong to the same transcript.
 *
 * <p>For any key other than {@code 5'UTR} or {@code 3'UTR} the feature's own location is
 * returned and the qualifiers are left untouched. For a UTR whose transcript has several UTRs of
 * that kind, only the UTR with the lowest start coordinate yields a (joined) location; the
 * others yield {@code null}. For every UTR that is kept, a {@code locus_tag} qualifier is set
 * and the {@code ID} qualifier is removed.
 *
 * @param feature the GFF feature being converted
 * @param key the (already mapped) key of the feature
 * @param qualifiers the working qualifier list; modified in place for UTR features
 * @return the location to use, or {@code null} if this UTR is covered by another UTR's join
 */
private Location joinUtrs(Feature feature, Key key, QualifierVector qualifiers) {
  final String keyName = key.getKeyString();
  final boolean fivePrime = keyName.equals("5'UTR");
  if (!fivePrime && !keyName.equals("3'UTR")) {
    return feature.getLocation();
  }

  Location joined = feature.getLocation();
  final ChadoCanonicalGene gene = ((GFFStreamFeature) feature).getChadoGene();
  String transcriptName = gene.getTranscriptFromName(GeneUtils.getUniqueName(feature));
  final List<Feature> utrs =
      fivePrime
          ? gene.get5UtrOfTranscript(transcriptName)
          : gene.get3UtrOfTranscript(transcriptName);

  if (utrs.size() > 1) {
    int lowestStart = Integer.MAX_VALUE;
    final RangeVector utrRanges = new RangeVector();
    for (Feature utr : utrs) {
      final Range utrRange = utr.getLocation().getTotalRange();
      lowestStart = Math.min(lowestStart, utrRange.getStart());
      utrRanges.add(utrRange);
    }

    // Only the left-most UTR carries the joined location; the rest are dropped by the caller.
    if (lowestStart != feature.getLocation().getTotalRange().getStart()) {
      return null;
    }
    joined = new Location(utrRanges, feature.getLocation().isComplement());
  }

  // With a single transcript the gene's unique name is used for the locus_tag instead.
  if (gene.getTranscripts().size() == 1) {
    transcriptName = gene.getGeneUniqueName();
  }
  qualifiers.setQualifier(new Qualifier("locus_tag", transcriptName));
  qualifiers.removeQualifierByName("ID");
  return joined;
}
/**
 * Adds a qualifier to a list of qualifiers, merging with an existing qualifier of the same name.
 *
 * <p>If no qualifier with that name is present, {@code newQualifier} is appended as is.
 * Otherwise each of its values that the existing qualifier does not already hold is added to the
 * existing qualifier. A {@code null} {@code newQualifier}, or one without values when a
 * qualifier of that name already exists, leaves the list unchanged.
 *
 * @param qualifiers the list to add to; modified in place
 * @param newQualifier the qualifier to add or merge; may be {@code null}
 */
private void addNewQualifier(QualifierVector qualifiers, Qualifier newQualifier) {
  // Callers pass the result of getQualifierByName(...) directly, which may be null.
  if (newQualifier == null) {
    return;
  }

  final Qualifier existing = qualifiers.getQualifierByName(newQualifier.getName());
  if (existing == null) {
    qualifiers.addElement(newQualifier);
    return;
  }

  final StringVector newValues = newQualifier.getValues();
  if (newValues == null) {
    return;
  }

  // getValues() can return null (see the check above), so the existing qualifier may have no
  // value list to compare against; in that case every new value is added.
  // NOTE(review): assumes Qualifier.addValue copes with a value-less qualifier - confirm.
  final StringVector existingValues = existing.getValues();
  for (int j = 0; j < newValues.size(); j++) {
    final String newValue = (String) newValues.get(j);
    if (existingValues == null || !existingValues.contains(newValue)) {
      existing.addValue(newValue);
    }
  }
}
/**
 * Maps a database (SO) key to its EMBL key using {@code DATABASE_MAP_KEYS}, initialising the
 * mappings first if necessary.
 *
 * <p>When the matching row carries an extra qualifier in its third column, that qualifier is
 * registered with the entry information if it is not yet valid for the mapped key, and its
 * values are added to {@code qualifiers}.
 *
 * @param key the key to map
 * @param qualifiers the qualifier list that receives any extra qualifier; modified in place
 * @return the mapped key, or {@code key} itself when no mapping matches
 */
private Key map(final Key key, final QualifierVector qualifiers) {
  if (DATABASE_MAP_KEYS == null) {
    initDatabaseMappings();
  }

  for (final Object[] row : DATABASE_MAP_KEYS) {
    if (!key.getKeyString().equals(row[0])) {
      continue;
    }

    final Key mappedKey = new Key((String) row[1]);
    if (row[2] == null) {
      return mappedKey;
    }

    final Qualifier extra = (Qualifier) row[2];
    if (!getEntryInformation().isValidQualifier(mappedKey, extra.getName())) {
      final StringVector extraValues = extra.getValues();
      final int valueCount = (extraValues == null) ? 0 : extraValues.size();
      final int type = (valueCount == 0) ? QualifierInfo.NO_VALUE : QualifierInfo.QUOTED_TEXT;
      try {
        getEntryInformation()
            .addQualifierInfo(new QualifierInfo(extra.getName(), type, null, null, false));
      } catch (QualifierInfoException e) {
        // Registration is best-effort: the qualifier values are still added below.
      }
    }
    qualifiers.addQualifierValues(extra);
    return mappedKey;
  }
  return key;
}
/** * Read some embl feature qualifiers from a stream into a QualifierVector object. The stream * should contain qualifiers in this form: * * <PRE> /name1=value1/name2="value2"/name3=[value3] </PRE> * * @param in_stream the qualifiers are read from this stream * @exception IOException thrown if there is a problem reading the qualifiers, such as end of * file. * @exception QualifierParseException Thrown if the format of the value String is not appropriate * for a Qualifier with the given name. Each qualifier has a specific format for the value * part which depends on the name, for example the value part of /codon_start qualifier must * be a number: 1, 2 or 3. * @return A Vector containing one Qualifier object for each name/value pair read from the stream. */ public static QualifierVector readQualifiers( final Reader in_stream, final EntryInformation entry_information) throws QualifierParseException, IOException { QualifierVector return_vector = new QualifierVector(); BufferedReader buffered_reader = new BufferedReader(in_stream); String name; String value; // loop until end of file while ((name = StreamQualifier.readName(buffered_reader)) != null) { // save one character in case the next char is not a '=' buffered_reader.mark(1); final int next_char = buffered_reader.read(); if (next_char == -1) value = null; else { if (next_char == '=') value = StreamQualifier.readValue(buffered_reader); else { // this qualifier doesn't have a value value = null; buffered_reader.reset(); } } final Qualifier new_qualifier; if (value == null) new_qualifier = new Qualifier(name); else { new_qualifier = StreamQualifier.makeStreamQualifier(name, value, entry_information); } return_vector.addQualifierValues(new_qualifier); } return return_vector; }
/**
 * Renames every qualifier called {@code oldName} to {@code newName}, keeping its values.
 *
 * <p>The list is rebuilt in its original order: qualifiers with other names are appended
 * unchanged, renamed ones are merged in through {@code addQualifierValues}. The rebuilt contents
 * then replace the contents of {@code qualifiers}.
 *
 * @param qualifiers the list to update in place
 * @param oldName the qualifier name to replace
 * @param newName the name to use instead
 */
private void changeQualifierName(
    QualifierVector qualifiers, final String oldName, final String newName) {
  final QualifierVector rebuilt = new QualifierVector();

  for (int i = 0; i < qualifiers.size(); i++) {
    final Qualifier current = (Qualifier) qualifiers.elementAt(i);
    if (current.getName().equals(oldName)) {
      rebuilt.addQualifierValues(new Qualifier(newName, current.getValues()));
    } else {
      rebuilt.addElement(current);
    }
  }

  qualifiers.removeAllElements();
  for (int i = 0; i < rebuilt.size(); i++) {
    qualifiers.addElement(rebuilt.elementAt(i));
  }
}
/**
 * Combines the transcript's qualifiers into {@code qualifiers} and, when the gene has more than
 * one transcript, adds the transcript's {@code ID} plus an {@code other_transcript} qualifier
 * naming each of the gene's other transcripts.
 *
 * <p>A transcript without an {@code ID} qualifier is tolerated: the ID is simply not added and
 * the {@code other_transcript} links are still created.
 *
 * @param qualifiers the working qualifier list; modified in place
 * @param transcript the transcript whose qualifiers are merged in
 * @param ntranscripts ignored; the count is always taken from {@code chadoGene} (kept for
 *     interface compatibility)
 * @param chadoGene the gene model the transcript belongs to
 * @return the number of transcripts of {@code chadoGene}
 */
private int handleTranscripts(
    QualifierVector qualifiers,
    Feature transcript,
    int ntranscripts,
    ChadoCanonicalGene chadoGene) {
  final QualifierVector transcriptQualifiers = transcript.getQualifiers().copy();
  combineQualifiers(qualifiers, transcriptQualifiers, false);

  final List<Feature> transcripts = chadoGene.getTranscripts();
  final int transcriptCount = transcripts.size();
  if (transcriptCount <= 1) {
    return transcriptCount;
  }

  // getQualifierByName returns null when the transcript has no ID; passing that on would raise
  // a NullPointerException and lose the other_transcript links below.
  final Qualifier idQualifier = transcriptQualifiers.getQualifierByName("ID");
  if (idQualifier != null) {
    addNewQualifier(qualifiers, idQualifier);
  }

  final String ownName = GeneUtils.getUniqueName(transcript);
  for (Feature other : transcripts) {
    final String otherName = GeneUtils.getUniqueName(other);
    if (!otherName.equals(ownName)) {
      addNewQualifier(qualifiers, new Qualifier("other_transcript", otherName));
    }
  }
  return transcriptCount;
}
/**
 * Replaces the stop_codon_redefined_as_selenocysteine SO qualifier with the EMBL
 * {@code transl_except} qualifier.
 *
 * <p>Only features whose key is {@code DatabaseDocument.EXONMODEL} are handled; anything else is
 * left untouched. The codon position is taken from the first stop codon found in the feature's
 * translation and converted to a sequence coordinate by walking the feature's segments.
 *
 * @param qualifiers the working qualifier list; modified in place
 * @param feature the feature being converted; its user data is expected to be the corresponding
 *     {@code uk.ac.sanger.artemis.Feature}
 */
private void handleSelenocysteine(QualifierVector qualifiers, Feature feature) {
  if (!feature.getKey().getKeyString().equals(DatabaseDocument.EXONMODEL)) {
    return;
  }
  qualifiers.removeQualifierByName("stop_codon_redefined_as_selenocysteine");

  final uk.ac.sanger.artemis.Feature f = (uk.ac.sanger.artemis.Feature) feature.getUserData();

  // Offset (in bases, from the start of the spliced feature) of the first stop codon.
  int stopCodonOffset = 0;
  final String aa = f.getTranslation().toString();
  for (int i = 0; i < aa.length(); i++) {
    if (AminoAcidSequence.isStopCodon(aa.charAt(i))) {
      stopCodonOffset = i * 3;
      break;
    }
  }

  // Find the segment containing that offset and convert it to a sequence position.
  final FeatureSegmentVector segments = f.getSegments();
  int basesBefore = 0;
  int sequenceloc = 0;
  for (int i = 0; i < segments.size(); i++) {
    final int seglen = segments.elementAt(i).getBases().length();
    if (basesBefore + seglen > stopCodonOffset) {
      sequenceloc =
          segments.elementAt(i).getStart().getPosition() + (stopCodonOffset - basesBefore);
      if (!f.isForwardFeature()) {
        sequenceloc = f.getStrand().getBases().getComplementPosition(sequenceloc);
      }
      // Stop here: every later segment also satisfies the test above and would overwrite the
      // position with one computed from the wrong segment start.
      break;
    }
    basesBefore += seglen;
  }

  final String pos;
  if (f.isForwardFeature()) {
    pos = sequenceloc + ".." + (sequenceloc + 2);
  } else {
    pos = "complement(" + (sequenceloc - 2) + ".." + sequenceloc + ")";
  }
  qualifiers.add(new Qualifier("transl_except", "(pos:" + pos + ",aa:Sec)"));
}
/** * Merge qualifiers * * @param qualifiers * @param newQualifiers */ private void combineQualifiers( final QualifierVector qualifiers, final QualifierVector newQualifiers, final boolean isGene) { for (int i = 0; i < newQualifiers.size(); i++) { Qualifier newQualifier = (Qualifier) newQualifiers.get(i); if (newQualifier.getName().equals("ID") && !isGene) { continue; } // convert GO evidence to codes (e.g. ND=No biological Data available) if (newQualifier.getName().equals("GO")) { final StringVector newValues = newQualifier.getValues(); final StringVector tmpNewValues = new StringVector(); for (int j = 0; j < newValues.size(); j++) { String val = GoBox.getEvidenceCodeGoTextFromText((String) newValues.get(j)); tmpNewValues.add(val); } newQualifier = new Qualifier("GO", tmpNewValues); } if (newQualifier.getName().equals("product")) { final StringVector newValues = newQualifier.getValues(); final StringVector tmpNewValues = new StringVector(); for (int j = 0; j < newValues.size(); j++) { String val = (String) newValues.get(j); int ind = 0; if ((ind = val.indexOf(";db_xref=")) > -1) val = val.substring(0, ind); if ((ind = val.indexOf(";evidence=")) > -1) val = val.substring(0, ind); if (val.startsWith("term=")) val = val.substring(5, val.length()); if (val.endsWith(";")) val = val.substring(0, val.length() - 1); tmpNewValues.add(val); } newQualifier = new Qualifier("product", tmpNewValues); } if (newQualifier.getName().equals("orthologous_to") || newQualifier.getName().equals("paralogous_to")) { final StringVector newValues = newQualifier.getValues(); final StringVector tmpNewValues = new StringVector(); for (int j = 0; j < newValues.size(); j++) { if (!newValues.get(j).equals("")) tmpNewValues.add(newValues.get(j)); } if (tmpNewValues.size() == 0) continue; Pattern p = Pattern.compile("\\w+:link=\\w+"); for (int j = 0; j < tmpNewValues.size(); j++) { String valueStr = (String) tmpNewValues.get(j); String newValueStr; int indexEnd = valueStr.indexOf(';'); String endStr 
= ""; if (indexEnd > -1) endStr = valueStr.substring(indexEnd); Matcher m = p.matcher(valueStr); while (m.find()) { int index = valueStr.indexOf("link=", m.start()); newValueStr = valueStr.substring(m.start(), index) + valueStr.substring(index + 5, m.end()) + endStr; if (newQualifier.getName().equals("orthologous_to")) newQualifier = new Qualifier("orthologous_to", newValueStr); else newQualifier = new Qualifier("paralogous_to", newValueStr); qualifiers.addElement(newQualifier); } } continue; } addNewQualifier(qualifiers, newQualifier); } }
/** * Map GFF features to EMBL/Genbank * * @param feature * @return */ private Object mapGffToNativeFeature(final Feature feature) { if (DATABASE_MAP_KEYS == null) initDatabaseMappings(); Key key = feature.getKey(); QualifierVector qualifiers = feature.getQualifiers().copy(); // ignore if obsolete if (IGNORE_OBSOLETE_FEATURES) { Qualifier isObsoleteQualifier = qualifiers.getQualifierByName("isObsolete"); if (isObsoleteQualifier != null) { String value = (String) isObsoleteQualifier.getValues().get(0); if (Boolean.parseBoolean(value)) return null; } } key = map(key, qualifiers); if (getEntryInformation().isValidQualifier((String) DATABASE_QUALIFIERS_TO_REMOVE[0])) { try { if (this instanceof EmblDocumentEntry) return new EmblStreamFeature(key, feature.getLocation(), qualifiers); else return new GenbankStreamFeature(key, feature.getLocation(), qualifiers); } catch (InvalidRelationException e) { // TODO Auto-generated catch block e.printStackTrace(); } } Location location = joinUtrs(feature, key, qualifiers); if (location == null) return null; // flatten gene model - combining qualifiers if (key.getKeyString().equals(DatabaseDocument.EXONMODEL)) { ChadoCanonicalGene chadoGene = ((GFFStreamFeature) feature).getChadoGene(); final String name = GeneUtils.getUniqueName(feature); final String transcriptName = chadoGene.getTranscriptFromName(name); StringVector sv = new StringVector(); sv.add(transcriptName); final Feature transcript = chadoGene.containsTranscript(sv); if (transcript != null && GeneUtils.isNonCodingTranscripts(transcript.getKey())) return null; qualifiers.removeQualifierByName("ID"); int ntranscripts = 0; // add transcript & protein qualifiers to CDS try { final Feature protein = chadoGene.getProteinOfTranscript(transcriptName); if (protein != null) combineQualifiers(qualifiers, protein.getQualifiers().copy(), false); if (transcript != null) ntranscripts = handleTranscripts(qualifiers, transcript, ntranscripts, chadoGene); } catch (NullPointerException npe) 
{ } // add gene qualifiers to CDS QualifierVector geneQualifiers = chadoGene.getGene().getQualifiers().copy(); // multiple transcripts if (ntranscripts > 1 && geneQualifiers.getQualifierByName("ID") != null) { Qualifier newIDQualifier = new Qualifier( "shared_id", (String) geneQualifiers.getQualifierByName("ID").getValues().get(0)); addNewQualifier(qualifiers, newIDQualifier); geneQualifiers.removeQualifierByName("ID"); } combineQualifiers(qualifiers, geneQualifiers, true); } else if (GeneUtils.isNonCodingTranscripts(key)) { // use gene id for non-coding transcripts ChadoCanonicalGene chadoGene = ((GFFStreamFeature) feature).getChadoGene(); if (chadoGene != null) { qualifiers.removeQualifierByName("ID"); QualifierVector geneQualifiers = chadoGene.getGene().getQualifiers().copy(); combineQualifiers(qualifiers, geneQualifiers, true); } } try { for (int i = 0; i < DATABASE_QUALIFIERS_TO_MAP.length; i++) { if (!getEntryInformation().isValidQualifier(DATABASE_QUALIFIERS_TO_MAP[i][0])) { changeQualifierName( qualifiers, DATABASE_QUALIFIERS_TO_MAP[i][0], DATABASE_QUALIFIERS_TO_MAP[i][1]); } } if (qualifiers.getQualifierByName("stop_codon_redefined_as_selenocysteine") != null) { handleSelenocysteine(qualifiers, feature); } for (int i = 0; i < DATABASE_QUALIFIERS_TO_REMOVE.length; i++) { if (!getEntryInformation().isValidQualifier((String) DATABASE_QUALIFIERS_TO_REMOVE[i])) qualifiers.removeQualifierByName((String) DATABASE_QUALIFIERS_TO_REMOVE[i]); } if (key.getKeyString().equals("polypeptide")) return null; else if (key.getKeyString().equals("gene")) return null; else if (key.getKeyString().equals("centromere")) return null; else if (key.getKeyString().equals("transcript") || key.getKeyString().equals("mRNA")) return null; if (this instanceof EmblDocumentEntry) return new EmblStreamFeature(key, location, qualifiers); else return new GenbankStreamFeature(key, location, qualifiers); } catch (InvalidRelationException e) { e.printStackTrace(); if (feature instanceof 
DatabaseStreamFeature) return new EmblStreamFeature(); else return new GenbankStreamFeature(); } }