/** {@inheritDoc} */ @Override public void process(Reader reader) throws Exception { // Data has format: // id | description @SuppressWarnings("rawtypes") Iterator lineIter = FormattedTextParser.parseTabDelimitedReader(reader); int count = 0; while (lineIter.hasNext()) { String[] line = (String[]) lineIter.next(); try { String entrez = line[0]; String description = line[1]; LOG.error("description " + count++ + " " + description); if (!StringUtils.isBlank(description)) { Item gene = createItem("Gene"); gene.setAttribute("primaryIdentifier", entrez); gene.setAttribute("description", description); gene.setReference("organism", getOrganism(HUMAN_TAXON_ID)); store(gene); } } catch (IndexOutOfBoundsException e) { LOG.info("Failed to read line: " + Arrays.asList(line)); } } }
protected void createFromFile(File f) throws IOException { // data is in format: // ZDBID ID1,ID2,ID3 Iterator<?> lineIter = FormattedTextParser.parseTabDelimitedReader(new BufferedReader(new FileReader(f))); while (lineIter.hasNext()) { String[] line = (String[]) lineIter.next(); if (line.length < 2 || line[0].startsWith("#") || !line[0].startsWith(GENE_PATTERN)) { continue; } String zfinId = line[0]; String[] synonyms = StringUtil.split(line[1].trim(), ","); resolver.addMainIds(taxonId, zfinId, Collections.singleton(zfinId)); resolver.addSynonyms(taxonId, zfinId, new HashSet<String>(Arrays.asList(synonyms))); } }
/** * Process Topo files and create two maps. * * @param file a topo file. * @return Map<key:primaryId/fullName, value:Map<key:symbol/symbol, value:level/position>> */ private Map<String, Map<String, String>> processTopoFile(File file) { Map<String, Map<String, String>> topoMap = new HashMap<String, Map<String, String>>(); try { Reader reader = new FileReader(file); Iterator<?> tsvIter; try { tsvIter = FormattedTextParser.parseTabDelimitedReader(reader); } catch (IOException e) { e.printStackTrace(); throw new RuntimeException(e); } while (tsvIter.hasNext()) { String[] line = (String[]) tsvIter.next(); if (line.length > 1) { // the file could end with an empty line if ("topos_tf.tsv".equals(file.getName())) { String symbol = line[0]; String primaryId = line[1]; String level = line[2]; Map<String, String> aMap = new HashMap<String, String>(); aMap.put(symbol, level); topoMap.put(primaryId, aMap); } else if ("topos_mirna.tsv".equals(file.getName())) { String symbol = line[0]; String fullName = line[1]; String position = line[2]; Map<String, String> aMap = new HashMap<String, String>(); aMap.put(symbol, position); topoMap.put(fullName, aMap); } } } } catch (FileNotFoundException e) { e.printStackTrace(); throw new RuntimeException(e); } return topoMap; }
/**
 * Reads a {@code |}-delimited journal file and stores one journal item per row.
 *
 * <p>Columns are: primary identifier, name, abbreviation, publisher. The first two columns
 * are required; the abbreviation and publisher columns are treated as optional, and any
 * column that is empty is simply not set on the item.
 *
 * @param reader reader over the journal file
 * @throws RuntimeException if a row has fewer than two columns
 * @throws SAXException if storing a journal fails
 * @throws Exception if the input cannot be parsed
 */
public void processJournals(Reader reader) throws Exception {
    Iterator<?> lineIter = FormattedTextParser.parseDelimitedReader(reader, '|');
    while (lineIter.hasNext()) {
        String[] line = (String[]) lineIter.next();
        if (line.length < 2) {
            // Guard the access to the first element: the row may be completely empty.
            String firstElement = line.length > 0 ? line[0] : "";
            throw new RuntimeException(
                    "Journal line does not have enough elements: " + line.length
                    + " " + firstElement);
        }

        String primaryIdentifier = line[0];
        String name = line[1];
        // Only two columns are guaranteed by the check above.
        String abbrev = line.length > 2 ? line[2] : null;
        String publisher = line.length > 3 ? line[3] : null;

        Item journal = getJournal(primaryIdentifier);
        if (!StringUtils.isEmpty(name)) {
            journal.setAttribute("name", name);
        }
        if (!StringUtils.isEmpty(abbrev)) {
            journal.setAttribute("abbrev", abbrev);
        }
        if (!StringUtils.isEmpty(publisher)) {
            journal.setAttribute("publisher", publisher);
        }

        try {
            store(journal);
        } catch (ObjectStoreException e) {
            throw new SAXException(e);
        }
    }
}
/** * Process all rows of the map_title.tab file * * @param reader a reader for the map_title.tab file * @throws IOException * @throws ObjectStoreException */ private void processMapTitleFile(Reader reader) throws IOException, ObjectStoreException { Iterator lineIter = FormattedTextParser.parseTabDelimitedReader(reader); // this file has data of the format: // pathway id | pathway name while (lineIter.hasNext()) { // line is a string array with the one element for each tab separated value // on the next line of the file String[] line = (String[]) lineIter.next(); String pathwayId = line[0]; String pathwayName = line[1]; // getPathway will create an Item or fetch it from a map if seen before Item pathway = getPathway(pathwayId); pathway.setAttribute("name", pathwayName); // once we have set the pathway name that is all the information needed so we can store store(pathway); } }
/** * @param reader * @throws Exception * @throws ObjectStoreException */ private void processHalfLifeDataFile(Reader preader) throws Exception, ObjectStoreException { /* * Sample line * protein half-life YMR028W 8.2 min 0.1 hr 25466257 * protein half-life YLR110C 10.0 min 0.2 hr 25466257 */ System.out.println("Processing Protien HalfLife Data file...."); Iterator<?> tsvIter; try { tsvIter = FormattedTextParser.parseTabDelimitedReader(preader); } catch (Exception e) { throw new BuildException("cannot parse file: " + getCurrentFile(), e); } while (tsvIter.hasNext()) { String[] line = (String[]) tsvIter.next(); if (line.length < 5) { LOG.error("Couldn't process line. Expected 6 cols, but was " + line.length); continue; } String experiment = line[0].trim(); String protein = line[1].trim(); String value = line[2].trim(); String units = line[3].trim(); // String valueMinutes = line[2].trim(); // String stringMinutes = line[3].trim(); // String valueHours = line[4].trim(); // String stringHours = line[5].trim(); String pmid = line[4].trim(); newProduct(experiment, protein, value, units, pmid); } preader.close(); }
/** {@inheritDoc} */ public void process(Reader reader) throws Exception { // Create a chromosome Item chromosome = createItem("Chromosome"); chromosome.setAttribute("primaryIdentifier", CHROMOSOME_PID); store(chromosome); @SuppressWarnings("rawtypes") Iterator lineIter = FormattedTextParser.parseTabDelimitedReader(reader); while (lineIter.hasNext()) { String[] line = (String[]) lineIter.next(); // remove header line if (!line[0].equals(HEADER_LINE)) { String ecogeneId = line[0]; String geneName = line[1]; String eCK = line[2]; String swissProtId = line[3]; String wisconsinGenBankId = line[4]; String genBankProteinId = line[5]; String genoBaseId = line[6]; String type = line[7]; String strand = line[8]; String start = line[9]; String end = line[10]; String synonym = line[11]; Set<String> symSet = new TreeSet<String>(); if (!eCK.equals(NULL_STRING)) { symSet.add(eCK); } if (!genoBaseId.equals(NULL_STRING)) { symSet.addAll(Arrays.asList(StringUtil.split(genoBaseId, "; "))); } if (!synonym.equals(NONE_STRING)) { symSet.addAll(Arrays.asList(synonym.split(", "))); } if (type.equals(TYPE_GENE)) { Item gene = createItem("Gene"); gene.setReference("chromosome", chromosome); gene.setReference("organism", getOrganism(ECOLI_TAXON)); gene.setAttribute("primaryIdentifier", ecogeneId); gene.setAttribute("secondaryIdentifier", wisconsinGenBankId); gene.setAttribute("name", geneName); gene.setAttribute("symbol", geneName); if (symSet.size() > 0) { for (String sym : symSet) { createSynonym(gene, sym, true); } } if (!swissProtId.equals(NULL_STRING)) { if (proteinMap.containsKey(swissProtId)) { // Reference a protein to a gene (a gene has proteins // collection) gene.addToCollection("proteins", proteinMap.get(swissProtId)); } else { Item protein = createItem("Protein"); protein.setAttribute("primaryAccession", swissProtId); // NCBI Protein id, remove "g" protein.setAttribute("secondaryIdentifier", genBankProteinId.substring(1)); gene.addToCollection("proteins", protein); 
store(protein); proteinMap.put(swissProtId, protein); } } // Create chromosome location if (start.matches(DIGIT_REGEX) && end.matches(DIGIT_REGEX)) { Item location = createItem("Location"); location.setAttribute("start", start); location.setAttribute("end", end); location.setReference("feature", gene); location.setReference("locatedOn", chromosome); if (strand.equals(CLOCKWISE)) { location.setAttribute("strand", "+1"); } else if (strand.equals(COUNTER_CLOCKWISE)) { location.setAttribute("strand", "-1"); } else { location.setAttribute("strand", "0"); } gene.setReference("chromosomeLocation", location); store(location); } store(gene); } else if (type.equals(TYPE_RNA)) { // TODO code refactory Item rna = createItem("NcRNA"); rna.setReference("chromosome", chromosome); rna.setReference("organism", getOrganism(ECOLI_TAXON)); rna.setAttribute("primaryIdentifier", ecogeneId); rna.setAttribute("secondaryIdentifier", wisconsinGenBankId); rna.setAttribute("name", geneName); rna.setAttribute("symbol", geneName); if (symSet.size() > 0) { for (String sym : symSet) { createSynonym(rna, sym, true); } } // Create chromosome location if (start.matches(DIGIT_REGEX) && end.matches(DIGIT_REGEX)) { Item location = createItem("Location"); location.setAttribute("start", start); location.setAttribute("end", end); location.setReference("feature", rna); location.setReference("locatedOn", chromosome); if (strand.equals(CLOCKWISE)) { location.setAttribute("strand", "+1"); } else if (strand.equals(COUNTER_CLOCKWISE)) { location.setAttribute("strand", "-1"); } else { location.setAttribute("strand", "0"); } rna.setReference("chromosomeLocation", location); store(location); } store(rna); } } } }
/** * Process the edge data file. * * @param file the edge data file * @param tfMap a customized map with TF information * @param miRNAMap a customized map with miRNA information */ private void processEdgeFile( File file, Map<String, Map<String, String>> tfMap, Map<String, Map<String, String>> miRNAMap) { try { Reader reader = new FileReader(file); Iterator<?> tsvIter; try { tsvIter = FormattedTextParser.parseTabDelimitedReader(reader); } catch (IOException e) { e.printStackTrace(); throw new RuntimeException(e); } while (tsvIter.hasNext()) { String[] line = (String[]) tsvIter.next(); if (line.length > 1) { String sourceIdentifier = line[0]; String targetIdentifier = line[1]; try { if (tfMap.containsKey(sourceIdentifier)) { // Create source gene String sourceSymbol = tfMap.get(sourceIdentifier).keySet().iterator().next(); String sourceLevel = tfMap.get(sourceIdentifier).get(sourceSymbol); String sourceGenePid = createGene(sourceIdentifier, sourceSymbol); // Create networkProperty for source gene Item sourceNetworkProperty = createNetworkProperty(TOPO_TYPE_LEVEL, sourceLevel); sourceNetworkProperty.setReference("node", geneItems.get(sourceGenePid)); store(sourceNetworkProperty); if (tfMap.containsKey(targetIdentifier)) { // Create regulation for both genes Item regulation = createRegulation(INTERACTION_TYPE_TF_TF); // Create target gene String targetSymbol = tfMap.get(targetIdentifier).keySet().iterator().next(); String targetLevel = tfMap.get(targetIdentifier).get(targetSymbol); String targetGenePid = createGene(targetIdentifier, targetSymbol); // Create networkProperty for target gene Item targetNetworkProperty = createNetworkProperty(TOPO_TYPE_LEVEL, targetLevel); targetNetworkProperty.setReference("node", geneItems.get(targetGenePid)); store(targetNetworkProperty); regulation.setReference("source", geneItems.get(sourceGenePid)); regulation.setReference("target", geneItems.get(targetGenePid)); store(regulation); } else if (miRNAMap.containsKey(targetIdentifier)) 
{ // Create regulation for both genes Item regulation = createRegulation(INTERACTION_TYPE_TF_MIRNA); // Create target gene String targetSymbol = miRNAMap.get(targetIdentifier).keySet().iterator().next(); String targetPosition = miRNAMap.get(targetIdentifier).get(targetSymbol); String targetGenePid = createGene(null, targetSymbol); // Create networkProperty for target gene Item targetNetworkProperty = createNetworkProperty(TOPO_TYPE_POSITION, targetPosition); targetNetworkProperty.setReference("node", geneItems.get(targetGenePid)); store(targetNetworkProperty); regulation.setReference("source", geneItems.get(sourceGenePid)); regulation.setReference("target", geneItems.get(targetGenePid)); store(regulation); } else { continue; } } else if (miRNAMap.containsKey(sourceIdentifier)) { // Create source gene String sourceSymbol = miRNAMap.get(sourceIdentifier).keySet().iterator().next(); String sourcePosition = miRNAMap.get(sourceIdentifier).get(sourceSymbol); String sourceGenePid = createGene(null, sourceSymbol); // Create networkProperty for source gene Item sourceNetworkProperty = createNetworkProperty(TOPO_TYPE_POSITION, sourcePosition); sourceNetworkProperty.setReference("node", geneItems.get(sourceGenePid)); store(sourceNetworkProperty); if (tfMap.containsKey(targetIdentifier)) { // Create regulation for both genes Item regulation = createRegulation(INTERACTION_TYPE_MIRNA_TF); // Create target gene String targetSymbol = tfMap.get(targetIdentifier).keySet().iterator().next(); String targetLevel = tfMap.get(targetIdentifier).get(targetSymbol); String targetGenePid = createGene(targetIdentifier, targetSymbol); // Create networkProperty for target gene Item targetNetworkProperty = createNetworkProperty(TOPO_TYPE_LEVEL, targetLevel); targetNetworkProperty.setReference("node", geneItems.get(targetGenePid)); store(targetNetworkProperty); regulation.setReference("source", geneItems.get(sourceGenePid)); regulation.setReference("target", geneItems.get(targetGenePid)); 
store(regulation); } else if (miRNAMap.containsKey(targetIdentifier)) { // Create regulation for both genes Item regulation = createRegulation(INTERACTION_TYPE_MIRNA_MIRNA); // Create target gene String targetSymbol = miRNAMap.get(targetIdentifier).keySet().iterator().next(); String targetPosition = miRNAMap.get(targetIdentifier).get(targetSymbol); String targetGenePid = createGene(null, targetSymbol); // Create networkProperty for target gene Item targetNetworkProperty = createNetworkProperty(TOPO_TYPE_POSITION, targetPosition); targetNetworkProperty.setReference("node", geneItems.get(targetGenePid)); store(targetNetworkProperty); regulation.setReference("source", geneItems.get(sourceGenePid)); regulation.setReference("target", geneItems.get(targetGenePid)); store(regulation); } else { continue; } } else { continue; } } catch (ObjectStoreException e) { e.printStackTrace(); throw new RuntimeException(e); } } } } catch (FileNotFoundException e) { e.printStackTrace(); throw new RuntimeException(e); } }
/** * Process the csv file * * @param reader the Reader * @see DataConverter#process * @throws Exception if something goes wrong */ @Override public void process(Reader reader) throws Exception { if (rslv == null) { rslv = IdResolverService.getFlyIdResolver(); } Iterator<String[]> it = FormattedTextParser.parseTabDelimitedReader(reader); while (it.hasNext()) { String[] lineBits = it.next(); String geneCG = lineBits[0]; if (!geneCG.startsWith("CG")) { // ignore clones for now continue; } // Try to create/fetch gene, if null the IdResolver failed so do nothing for this row Item gene = getGene(geneCG); if (gene == null) { continue; } String stage = lineBits[1]; String resultKey = geneCG + stage; Item result = getResult(resultKey, gene.getIdentifier(), pub.getIdentifier(), stage); Integer stageNumber = null; try { stageNumber = new Integer(stage); } catch (NumberFormatException e) { // bad line in file, just keep going continue; } result.setAttribute("stageRange", STAGE_LABELS[stageNumber.intValue()] + " (BDGP in situ)"); if (lineBits.length > 2) { String image = lineBits[2]; if (StringUtils.isNotEmpty(image)) { setImage(result, URL + image); } } if (lineBits.length > 3) { String term = lineBits[3]; Item termItem = getTerm(term); if (termItem != null) { result.addToCollection("mRNAExpressionTerms", termItem); } if ("no staining".equals(term)) { result.setAttribute("expressed", "false"); } } } for (Item result : results.values()) { if (!result.hasCollection("mRNAExpressionTerms") || result.getCollection("mRNAExpressionTerms").getRefIds().isEmpty()) { result.setAttribute("expressed", "false"); } } storeAll(imgs); storeAll(results); }