Ejemplo n.º 1
0
  /**
   * Creates a genome element node through the batch inserter and registers it in the two given
   * indexes.
   *
   * <p>The node properties are written into {@code genomeElementProperties}, a map that is not
   * declared in this method, before that map is handed to the inserter.
   *
   * @param version value stored under {@code GenomeElementNode.VERSION_PROPERTY}; also the value
   *     the node is indexed by in {@code genomeElementVersionIndex}
   * @param comment value stored under {@code GenomeElementNode.COMMENT_PROPERTY}
   * @param definition value stored under {@code GenomeElementNode.DEFINITION_PROPERTY}
   * @param inserter batch inserter used to create the node
   * @param genomeElementVersionIndex index that receives the node keyed by its version
   * @param nodeTypeIndex index that receives the node keyed by its node type
   * @return the id returned by {@code inserter.createNode} for the new node
   */
  private static long createGenomeElementNode(
      String version,
      String comment,
      String definition,
      BatchInserter inserter,
      BatchInserterIndex genomeElementVersionIndex,
      BatchInserterIndex nodeTypeIndex) {

    // Per-record values go into the externally declared properties map.
    genomeElementProperties.put(GenomeElementNode.VERSION_PROPERTY, version);
    genomeElementProperties.put(GenomeElementNode.COMMENT_PROPERTY, comment);
    genomeElementProperties.put(GenomeElementNode.DEFINITION_PROPERTY, definition);

    final long nodeId = inserter.createNode(genomeElementProperties);

    // Index by version first, then by node type.
    genomeElementVersionIndex.add(
        nodeId, MapUtil.map(GenomeElementNode.GENOME_ELEMENT_VERSION_INDEX, version));
    nodeTypeIndex.add(
        nodeId, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, GenomeElementNode.NODE_TYPE));

    return nodeId;
  }
Ejemplo n.º 2
0
  /**
   * Imports every {@code .gbff} file of a folder into a database through a batch inserter.
   *
   * <p>Expected arguments:
   *
   * <ol>
   *   <li>folder containing the {@code .gbff} files
   *   <li>Bio4j DB folder
   *   <li>batch inserter {@code .properties} file
   * </ol>
   *
   * <p>For each record of each file (a record ends at a line starting with {@code
   * GBCommon.LAST_LINE_STR} or at the end of the file) one genome element node is created from the
   * VERSION, DEFINITION and COMMENT sections, plus one node per gene / CDS / RNA feature found in
   * the FEATURES section; each feature node is linked to its genome element node and indexed by
   * node type. The ORIGIN section is read past but its content is not stored.
   *
   * <p>Progress and errors are written to {@code ImportRefSeq.log}. Any exception aborts the import
   * and is logged; the index provider, the inserter and the log handler are always released.
   *
   * @param args the three command line arguments described above
   */
  public static void main(String[] args) {

    if (args.length != 3) {
      System.out.println(
          "This program expects the following parameters: \n"
              + "1. Folder name with all the .gbff files \n"
              + "2. Bio4j DB folder \n"
              + "3. batch inserter .properties file");
      return;
    }

    File currentFolder = new File(args[0]);

    // listFiles() returns null when the path is not a readable directory
    File[] files = currentFolder.listFiles();

    BatchInserter inserter = null;
    BatchInserterIndexProvider indexProvider = null;

    // ----------------------------------------------------------------------------------
    // ---------------------initializing node type properties----------------------------
    genomeElementProperties.put(GenomeElementNode.NODE_TYPE_PROPERTY, GenomeElementNode.NODE_TYPE);
    geneProperties.put(GeneNode.NODE_TYPE_PROPERTY, GeneNode.NODE_TYPE);
    cdsProperties.put(CDSNode.NODE_TYPE_PROPERTY, CDSNode.NODE_TYPE);
    miscRnaProperties.put(MiscRNANode.NODE_TYPE_PROPERTY, MiscRNANode.NODE_TYPE);
    mRnaProperties.put(MRNANode.NODE_TYPE_PROPERTY, MRNANode.NODE_TYPE);
    ncRnaProperties.put(NcRNANode.NODE_TYPE_PROPERTY, NcRNANode.NODE_TYPE);
    rRnaProperties.put(RRNANode.NODE_TYPE_PROPERTY, RRNANode.NODE_TYPE);
    tmRnaProperties.put(TmRNANode.NODE_TYPE_PROPERTY, TmRNANode.NODE_TYPE);
    tRnaProperties.put(TRNANode.NODE_TYPE_PROPERTY, TRNANode.NODE_TYPE);
    // ----------------------------------------------------------------------------------
    // ----------------------------------------------------------------------------------

    try {
      // This block configures the logger with handler and formatter
      fh = new FileHandler("ImportRefSeq.log", false);

      SimpleFormatter formatter = new SimpleFormatter();
      fh.setFormatter(formatter);
      logger.addHandler(fh);
      logger.setLevel(Level.ALL);

      if (files == null) {
        // Report the real problem instead of failing later with a bare NullPointerException.
        logger.log(Level.SEVERE, "Cannot list the files of folder: " + currentFolder);
        files = new File[0];
      }

      // create the batch inserter
      inserter = BatchInserters.inserter(args[1], MapUtil.load(new File(args[2])));

      // create the batch index service
      indexProvider = new LuceneBatchInserterIndexProvider(inserter);

      // -----------------create batch indexes----------------------------------
      // ----------------------------------------------------------------------
      BatchInserterIndex genomeElementVersionIndex =
          indexProvider.nodeIndex(
              GenomeElementNode.GENOME_ELEMENT_VERSION_INDEX,
              MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST));
      BatchInserterIndex nodeTypeIndex =
          indexProvider.nodeIndex(
              Bio4jManager.NODE_TYPE_INDEX_NAME,
              MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST));

      for (File file : files) {
        if (!file.getName().endsWith(".gbff")) {
          continue;
        }

        logger.log(Level.INFO, ("file: " + file.getName()));

        BufferedReader reader = new BufferedReader(new FileReader(file));
        try {
          String line = null;

          while ((line = reader.readLine()) != null) {

            // A blank line cannot start a record; skipping it avoids creating an empty
            // genome element node for blank lines that follow the last record.
            if (line.trim().length() == 0) {
              continue;
            }

            // this is the first line where the locus is
            String definitionSt = "";
            String versionSt = "";
            String commentSt = "";

            ArrayList<String> cdsList = new ArrayList<String>();
            ArrayList<String> geneList = new ArrayList<String>();
            ArrayList<String> miscRnaList = new ArrayList<String>();
            ArrayList<String> mRnaList = new ArrayList<String>();
            ArrayList<String> ncRnaList = new ArrayList<String>();
            ArrayList<String> rRnaList = new ArrayList<String>();
            ArrayList<String> tmRnaList = new ArrayList<String>();
            ArrayList<String> tRnaList = new ArrayList<String>();

            // Now I get all the lines till I reach the last line of the record.
            // Every inner loop stops on a null line so that a file ending in the middle of a
            // section terminates the record instead of throwing.
            do {
              // false when a branch has already read the line the next iteration must inspect
              boolean readLineFlag = true;

              if (line.startsWith(GBCommon.LOCUS_STR)) {
                // do nothing right now
              } else if (line.startsWith(GBCommon.ACCESSION_STR)) {
                // the accession is not stored anywhere, so there is nothing to parse
              } else if (line.startsWith(GBCommon.VERSION_STR)) {

                versionSt = line.split(GBCommon.VERSION_STR)[1].trim().split(" ")[0];

              } else if (line.startsWith(GBCommon.DEFINITION_STR)) {

                definitionSt += line.split(GBCommon.DEFINITION_STR)[1].trim();
                line = reader.readLine();
                while (line != null && line.startsWith(" ")) {
                  // continuation lines are joined with a space so that the last word of one
                  // line is not glued to the first word of the next
                  definitionSt += " " + line.trim();
                  line = reader.readLine();
                }
                readLineFlag = false;

              } else if (line.startsWith(GBCommon.COMMENT_STR)) {

                commentSt += line.split(GBCommon.COMMENT_STR)[1].trim();
                line = reader.readLine();
                while (line != null && line.startsWith(" ")) {
                  commentSt += "\n" + line.trim();
                  line = reader.readLine();
                }
                readLineFlag = false;

              } else if (line.startsWith(GBCommon.FEATURES_STR)) {

                do {
                  line = reader.readLine();
                  if (line == null) {
                    break;
                  }

                  // The feature key is looked up after the first 5 characters of the line;
                  // a shorter line cannot hold one.
                  String lineSubstr5 = line.length() > 5 ? line.substring(5) : "";

                  if (lineSubstr5.startsWith(GBCommon.CDS_STR)) {
                    line = collectFeaturePositions(reader, line, GBCommon.CDS_STR, cdsList);
                  } else if (lineSubstr5.startsWith(GBCommon.GENE_STR)) {
                    line = collectFeaturePositions(reader, line, GBCommon.GENE_STR, geneList);
                  } else if (lineSubstr5.startsWith(GBCommon.MISC_RNA_STR)) {
                    line =
                        collectFeaturePositions(reader, line, GBCommon.MISC_RNA_STR, miscRnaList);
                  } else if (lineSubstr5.startsWith(GBCommon.TM_RNA_STR)) {
                    line = collectFeaturePositions(reader, line, GBCommon.TM_RNA_STR, tmRnaList);
                  } else if (lineSubstr5.startsWith(GBCommon.R_RNA_STR)) {
                    line = collectFeaturePositions(reader, line, GBCommon.R_RNA_STR, rRnaList);
                  } else if (lineSubstr5.startsWith(GBCommon.M_RNA_STR)) {
                    line = collectFeaturePositions(reader, line, GBCommon.M_RNA_STR, mRnaList);
                  } else if (lineSubstr5.startsWith(GBCommon.NC_RNA_STR)) {
                    line = collectFeaturePositions(reader, line, GBCommon.NC_RNA_STR, ncRnaList);
                  } else if (lineSubstr5.startsWith(GBCommon.T_RNA_STR)) {
                    line = collectFeaturePositions(reader, line, GBCommon.T_RNA_STR, tRnaList);
                  }

                } while (line != null && line.startsWith(" "));
                readLineFlag = false;

              } else if (line.startsWith(GBCommon.ORIGIN_STR)) {

                // The sequence itself is not stored; its lines are only read past.
                do {
                  line = reader.readLine();
                } while (line != null && line.startsWith(" "));
                readLineFlag = false;
              }

              if (readLineFlag) {
                line = reader.readLine();
              }

            } while (line != null && !line.startsWith(GBCommon.LAST_LINE_STR));

            // --------create genome element node--------------
            long genomeElementId =
                createGenomeElementNode(
                    versionSt,
                    commentSt,
                    definitionSt,
                    inserter,
                    genomeElementVersionIndex,
                    nodeTypeIndex);

            // -----------genes-----------------
            for (String genePositionsSt : geneList) {
              geneProperties.put(GeneNode.POSITIONS_PROPERTY, genePositionsSt);
              long geneId = inserter.createNode(geneProperties);
              inserter.createRelationship(genomeElementId, geneId, genomeElementGeneRel, null);

              // indexing gene node by its node_type
              nodeTypeIndex.add(
                  geneId, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, GeneNode.NODE_TYPE));
            }

            // -----------CDS-----------------
            for (String cdsPositionsSt : cdsList) {
              cdsProperties.put(CDSNode.POSITIONS_PROPERTY, cdsPositionsSt);
              long cdsID = inserter.createNode(cdsProperties);
              inserter.createRelationship(genomeElementId, cdsID, genomeElementCDSRel, null);

              // indexing CDS node by its node_type
              nodeTypeIndex.add(
                  cdsID, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, CDSNode.NODE_TYPE));
            }

            // -----------misc rna-----------------
            for (String miscRnaPositionsSt : miscRnaList) {
              miscRnaProperties.put(MiscRNANode.POSITIONS_PROPERTY, miscRnaPositionsSt);
              long miscRnaID = inserter.createNode(miscRnaProperties);
              inserter.createRelationship(
                  genomeElementId, miscRnaID, genomeElementMiscRnaRel, null);

              // indexing MiscRNA node by its node_type
              nodeTypeIndex.add(
                  miscRnaID,
                  MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, MiscRNANode.NODE_TYPE));
            }

            // -----------m rna-----------------
            for (String mRnaPositionsSt : mRnaList) {
              mRnaProperties.put(MRNANode.POSITIONS_PROPERTY, mRnaPositionsSt);
              long mRnaID = inserter.createNode(mRnaProperties);
              inserter.createRelationship(genomeElementId, mRnaID, genomeElementMRnaRel, null);

              // indexing MRNA node by its node_type
              nodeTypeIndex.add(
                  mRnaID, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, MRNANode.NODE_TYPE));
            }

            // -----------nc rna-----------------
            for (String ncRnaPositionsSt : ncRnaList) {
              ncRnaProperties.put(NcRNANode.POSITIONS_PROPERTY, ncRnaPositionsSt);
              long ncRnaID = inserter.createNode(ncRnaProperties);
              inserter.createRelationship(genomeElementId, ncRnaID, genomeElementNcRnaRel, null);

              // indexing NCRNA node by its node_type
              nodeTypeIndex.add(
                  ncRnaID, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, NcRNANode.NODE_TYPE));
            }

            // -----------r rna-----------------
            for (String rRnaPositionsSt : rRnaList) {
              rRnaProperties.put(RRNANode.POSITIONS_PROPERTY, rRnaPositionsSt);
              long rRnaID = inserter.createNode(rRnaProperties);
              inserter.createRelationship(genomeElementId, rRnaID, genomeElementRRnaRel, null);

              // indexing RRNA node by its node_type
              nodeTypeIndex.add(
                  rRnaID, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, RRNANode.NODE_TYPE));
            }

            // -----------tm rna-----------------
            for (String tmRnaPositionsSt : tmRnaList) {
              tmRnaProperties.put(TmRNANode.POSITIONS_PROPERTY, tmRnaPositionsSt);
              long tmRnaID = inserter.createNode(tmRnaProperties);
              inserter.createRelationship(genomeElementId, tmRnaID, genomeElementTmRnaRel, null);

              // indexing TmRNA node by its node_type
              nodeTypeIndex.add(
                  tmRnaID, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, TmRNANode.NODE_TYPE));
            }

            // -----------t rna-----------------
            for (String tRnaPositionsSt : tRnaList) {
              tRnaProperties.put(TRNANode.POSITIONS_PROPERTY, tRnaPositionsSt);
              long tRnaID = inserter.createNode(tRnaProperties);
              inserter.createRelationship(genomeElementId, tRnaID, genomeElementTRnaRel, null);

              // indexing TRNA node by its node_type
              nodeTypeIndex.add(
                  tRnaID, MapUtil.map(Bio4jManager.NODE_TYPE_INDEX_NAME, TRNANode.NODE_TYPE));
            }

            logger.log(Level.INFO, (versionSt + " saved!"));
          }
        } finally {
          // release the file handle even when parsing or inserting fails
          reader.close();
        }
      }

    } catch (Exception e) {
      // Passing the exception itself lets the handler record the full stack trace in one record.
      logger.log(Level.SEVERE, "RefSeq import failed: " + e.getMessage(), e);
    } finally {

      // Each resource is null when set-up failed before it was created, and a failure while
      // releasing one resource must not prevent releasing the others.
      try {
        // shutdown, makes sure all changes are written to disk
        if (indexProvider != null) {
          indexProvider.shutdown();
        }
      } finally {
        try {
          if (inserter != null) {
            inserter.shutdown();
          }
        } finally {
          // closing logger file handler
          if (fh != null) {
            fh.close();
          }
        }
      }
    }
  }

  /**
   * Reads the positions text of one feature and appends it to {@code positionsList}.
   *
   * <p>The positions start on {@code featureLine}, right after {@code featureKey}, and continue on
   * the following lines until a line whose trimmed text starts with {@code "/"} or the end of the
   * input is reached. All parts are trimmed and concatenated without separator.
   *
   * @param reader source of the lines following {@code featureLine}
   * @param featureLine line holding the feature key and the beginning of its positions
   * @param featureKey feature key used (as a regular expression) to split {@code featureLine}
   * @param positionsList list receiving the assembled positions text
   * @return the line at which reading stopped, or {@code null} at the end of the input
   * @throws java.io.IOException if reading from {@code reader} fails
   */
  private static String collectFeaturePositions(
      BufferedReader reader,
      String featureLine,
      String featureKey,
      ArrayList<String> positionsList)
      throws java.io.IOException {

    StringBuilder positions = new StringBuilder(featureLine.trim().split(featureKey)[1].trim());

    String line = reader.readLine();
    while (line != null && !line.trim().startsWith("/")) {
      positions.append(line.trim());
      line = reader.readLine();
    }

    positionsList.add(positions.toString());
    return line;
  }