Ejemplo n.º 1
0
  /**
   * Rebuilds the store located at {@code this.path} from a pipe-delimited adjacency file.
   *
   * <p>Every line of the file is expected to look like {@code name|target1|target2|...}. The file
   * is read twice:
   *
   * <ol>
   *   <li>first pass: one node is created (and added to the {@code "nodes"} index) for the first
   *       field of each line, and its id is remembered in an in-memory name-to-id cache;
   *   <li>second pass: for each line a {@code KNOWS} relationship is created from the node named
   *       by the first field to the node named by each of the remaining fields.
   * </ol>
   *
   * <p>The existing store directory is deleted before the import starts. Progress is printed to
   * standard output every {@code REPORT_COUNT} created nodes / relationships.
   *
   * @param filePath path of the adjacency file to import
   * @throws IOException if the store directory cannot be deleted or the file cannot be read
   */
  public void importFromFile(String filePath) throws IOException {
    Map<String, Long> cache = new HashMap<String, Long>(COUNT);
    final File storeDir = new File(this.path);
    org.apache.commons.io.FileUtils.deleteDirectory(storeDir);
    final BatchInserter batchInserter = new BatchInserterImpl(storeDir.getAbsolutePath());
    BatchInserterIndexProvider indexProvider = null;
    try {
      indexProvider = new LuceneBatchInserterIndexProvider(batchInserter);
      final BatchInserterIndex index =
          indexProvider.nodeIndex("nodes", MapUtil.stringMap("type", "exact"));

      // ---------------- first pass: create one node per line ----------------
      String line = null;
      int nodes = 0;
      long time = System.currentTimeMillis();
      long batchTime = time;
      BufferedReader reader = new BufferedReader(new FileReader(filePath));
      try {
        while ((line = reader.readLine()) != null) {
          final String[] nodeNames = line.split("\\|");
          final String name = nodeNames[0];
          final Map<String, Object> props = MapUtil.map("name", name);
          final long node = batchInserter.createNode(props);
          index.add(node, props);
          cache.put(name, node);
          nodes++;
          if ((nodes % REPORT_COUNT) == 0) {
            System.out.printf(
                "%d nodes created. Took %d %n", nodes, (System.currentTimeMillis() - batchTime));
            batchTime = System.currentTimeMillis();
          }
        }
        System.out.println("Creating nodes took " + (System.currentTimeMillis() - time) / 1000);
        index.flush();
      } finally {
        reader.close();
      }

      // ------------- second pass: connect each node to its targets -------------
      int rels = 0;
      time = System.currentTimeMillis();
      batchTime = time;
      // The relationship type never changes, so resolve it once instead of once per relationship.
      final DynamicRelationshipType relationshipType = DynamicRelationshipType.withName("KNOWS");
      reader = new BufferedReader(new FileReader(filePath));
      try {
        while ((line = reader.readLine()) != null) {
          final String[] nodeNames = line.split("\\|");
          final String name = nodeNames[0];
          // The cache is used instead of an index lookup; it was filled from this same file.
          final Long from = cache.get(name);
          for (int j = 1; j < nodeNames.length; j++) {
            // Look up the j-th target, not the source node again (that produced self-loops only).
            final Long to = cache.get(nodeNames[j]);
            if (to == null) {
              // The target never appeared as the first field of any line, so no node exists.
              System.err.printf(
                  "Skipping relationship %s -> %s: unknown target node%n", name, nodeNames[j]);
              continue;
            }
            batchInserter.createRelationship(from, to, relationshipType, null);
            // Count actual relationships so the progress message matches what it reports.
            rels++;
            if ((rels % REPORT_COUNT) == 0) {
              System.out.printf(
                  "%d relationships created. Took %d %n",
                  rels, (System.currentTimeMillis() - batchTime));
              batchTime = System.currentTimeMillis();
            }
          }
        }
      } finally {
        reader.close();
      }
      System.out.println(
          "Creating relationships took " + (System.currentTimeMillis() - time) / 1000);
    } finally {
      // Always release the store, even when the import fails half way through.
      try {
        if (indexProvider != null) {
          indexProvider.shutdown();
        }
      } finally {
        batchInserter.shutdown();
      }
    }
  }
Ejemplo n.º 2
0
  /**
   * Imports every file whose name ends in {@code .gbff} found in the current working directory
   * into the batch-inserter database.
   *
   * <p>Each file is read record by record (a record ends at a line starting with {@code
   * GBCommon.LAST_LINE_STR}). For every record the accession, version, definition, comment,
   * sequence and the location strings of the supported feature kinds (CDS, gene, misc RNA, mRNA,
   * ncRNA, rRNA, tmRNA, tRNA) are collected. Only records in which an origin section was found are
   * stored: one genome element node is created and one node per feature location is created and
   * linked to it.
   *
   * <p>Any exception is logged to {@code ImportGenbank.log}; the index provider, the inserter and
   * the log file handler are always shut down before returning.
   *
   * @param args command line arguments (not used)
   */
  public static void main(String[] args) {

    File currentFolder = new File(".");

    File[] files = currentFolder.listFiles();

    BatchInserter inserter = null;
    BatchInserterIndexProvider indexProvider = null;

    // ----------------------------------------------------------------------------------
    // ---------------------initializing node type properties----------------------------
    genomeElementProperties.put(GenomeElementNode.NODE_TYPE_PROPERTY, GenomeElementNode.NODE_TYPE);
    geneProperties.put(GeneNode.NODE_TYPE_PROPERTY, GeneNode.NODE_TYPE);
    cdsProperties.put(CDSNode.NODE_TYPE_PROPERTY, CDSNode.NODE_TYPE);
    miscRnaProperties.put(MiscRNANode.NODE_TYPE_PROPERTY, MiscRNANode.NODE_TYPE);
    mRnaProperties.put(MRNANode.NODE_TYPE_PROPERTY, MRNANode.NODE_TYPE);
    ncRnaProperties.put(NcRNANode.NODE_TYPE_PROPERTY, NcRNANode.NODE_TYPE);
    rRnaProperties.put(RRNANode.NODE_TYPE_PROPERTY, RRNANode.NODE_TYPE);
    tmRnaProperties.put(TmRNANode.NODE_TYPE_PROPERTY, TmRNANode.NODE_TYPE);
    tRnaProperties.put(TRNANode.NODE_TYPE_PROPERTY, TRNANode.NODE_TYPE);
    // ----------------------------------------------------------------------------------
    // ----------------------------------------------------------------------------------

    try {
      // This block configures the logger with handler and formatter
      fh = new FileHandler("ImportGenbank.log", false);

      SimpleFormatter formatter = new SimpleFormatter();
      fh.setFormatter(formatter);
      logger.addHandler(fh);
      logger.setLevel(Level.ALL);

      // create the batch inserter
      inserter =
          new BatchInserterImpl(
              CommonData.DATABASE_FOLDER,
              BatchInserterImpl.loadProperties(CommonData.PROPERTIES_FILE_NAME));

      // create the batch index service
      indexProvider = new LuceneBatchInserterIndexProvider(inserter);

      // -----------------create batch indexes----------------------------------
      // ----------------------------------------------------------------------
      BatchInserterIndex genomeElementVersionIndex =
          indexProvider.nodeIndex(
              GenomeElementNode.GENOME_ELEMENT_VERSION_INDEX,
              MapUtil.stringMap(PROVIDER_ST, LUCENE_ST, TYPE_ST, EXACT_ST));

      if (files == null) {
        // File.listFiles() returns null when the directory cannot be read.
        logger.log(
            Level.SEVERE, "Could not list the contents of " + currentFolder.getAbsolutePath());
        files = new File[0];
      }

      for (File file : files) {
        if (!file.getName().endsWith(".gbff")) {
          continue;
        }

        BufferedReader reader = new BufferedReader(new FileReader(file));
        try {
          String line = null;

          while ((line = reader.readLine()) != null) {

            // this is the first line where the locus is
            String accessionSt = "";
            String definitionSt = "";
            String versionSt = "";
            String commentSt = "";
            StringBuilder seqStBuilder = new StringBuilder();

            ArrayList<String> cdsList = new ArrayList<String>();
            ArrayList<String> geneList = new ArrayList<String>();
            ArrayList<String> miscRnaList = new ArrayList<String>();
            ArrayList<String> mRnaList = new ArrayList<String>();
            ArrayList<String> ncRnaList = new ArrayList<String>();
            ArrayList<String> rRnaList = new ArrayList<String>();
            ArrayList<String> tmRnaList = new ArrayList<String>();
            ArrayList<String> tRnaList = new ArrayList<String>();

            boolean originFound = false;

            // Now I get all the lines till I reach the string '//'
            do {
              // Sections that consume their own continuation lines leave the first line of the
              // next section in 'line', so they clear this flag to avoid skipping it.
              boolean readLineFlag = true;

              if (line.startsWith(GBCommon.LOCUS_STR)) {
                // do nothing right now
              } else if (line.startsWith(GBCommon.ACCESSION_STR)) {

                accessionSt = valueAfterKey(line, GBCommon.ACCESSION_STR);

              } else if (line.startsWith(GBCommon.VERSION_STR)) {

                versionSt = valueAfterKey(line, GBCommon.VERSION_STR).split(" ")[0];

              } else if (line.startsWith(GBCommon.DEFINITION_STR)) {

                definitionSt += valueAfterKey(line, GBCommon.DEFINITION_STR);
                // indented lines continue the definition
                line = reader.readLine();
                while (line != null && line.startsWith(" ")) {
                  definitionSt += line.trim();
                  line = reader.readLine();
                }
                readLineFlag = false;

              } else if (line.startsWith(GBCommon.COMMENT_STR)) {

                commentSt += valueAfterKey(line, GBCommon.COMMENT_STR);
                // indented lines continue the comment, one output line per input line
                line = reader.readLine();
                while (line != null && line.startsWith(" ")) {
                  commentSt += "\n" + line.trim();
                  line = reader.readLine();
                }
                readLineFlag = false;

              } else if (line.startsWith(GBCommon.FEATURES_STR)) {

                // NOTE(review): a feature is recognised by the trimmed line starting with its
                // key, so a wrapped qualifier value that happens to start with a key would be
                // taken for a feature too - confirm against real input.
                do {
                  line = reader.readLine();
                  if (line == null) {
                    // the file ended inside the feature table
                    break;
                  }
                  String featureLine = line.trim();

                  if (featureLine.startsWith(GBCommon.CDS_STR)) {
                    line = readFeaturePositions(reader, featureLine, GBCommon.CDS_STR, cdsList);
                  } else if (featureLine.startsWith(GBCommon.GENE_STR)) {
                    line = readFeaturePositions(reader, featureLine, GBCommon.GENE_STR, geneList);
                  } else if (featureLine.startsWith(GBCommon.MISC_RNA_STR)) {
                    line =
                        readFeaturePositions(
                            reader, featureLine, GBCommon.MISC_RNA_STR, miscRnaList);
                  } else if (featureLine.startsWith(GBCommon.TM_RNA_STR)) {
                    line =
                        readFeaturePositions(reader, featureLine, GBCommon.TM_RNA_STR, tmRnaList);
                  } else if (featureLine.startsWith(GBCommon.R_RNA_STR)) {
                    line = readFeaturePositions(reader, featureLine, GBCommon.R_RNA_STR, rRnaList);
                  } else if (featureLine.startsWith(GBCommon.M_RNA_STR)) {
                    line = readFeaturePositions(reader, featureLine, GBCommon.M_RNA_STR, mRnaList);
                  } else if (featureLine.startsWith(GBCommon.NC_RNA_STR)) {
                    line =
                        readFeaturePositions(reader, featureLine, GBCommon.NC_RNA_STR, ncRnaList);
                  } else if (featureLine.startsWith(GBCommon.T_RNA_STR)) {
                    line = readFeaturePositions(reader, featureLine, GBCommon.T_RNA_STR, tRnaList);
                  }

                } while (line != null && line.startsWith(" "));
                readLineFlag = false;

              } else if (line.startsWith(GBCommon.ORIGIN_STR)) {
                // sequence

                originFound = true;

                do {
                  line = reader.readLine();
                  if (line == null) {
                    // the file ended inside the sequence
                    break;
                  }
                  // the first token of each line is skipped, the rest is sequence data
                  String[] tempArray = line.trim().split(" ");
                  for (int i = 1; i < tempArray.length; i++) {
                    seqStBuilder.append(tempArray[i]);
                  }

                } while (line.startsWith(" "));
                readLineFlag = false;
              }

              if (readLineFlag) {
                line = reader.readLine();
              }

            } while (line != null && !line.startsWith(GBCommon.LAST_LINE_STR));

            // -----we only save the data when the sequence is found------------
            if (originFound) {

              System.out.println("accessionSt = " + accessionSt);
              System.out.println("versionSt = " + versionSt);
              System.out.println("definitionSt = " + definitionSt);
              System.out.println("commentSt = " + commentSt);
              System.out.println("sequence.length = " + seqStBuilder.length());

              System.out.println("geneList = " + geneList);
              System.out.println("cdsList = " + cdsList);
              System.out.println("miscRnaList = " + miscRnaList);
              System.out.println("mRnaList = " + mRnaList);
              System.out.println("ncRnaList = " + ncRnaList);
              System.out.println("rRnaList = " + rRnaList);
              System.out.println("tmRnaList = " + tmRnaList);
              System.out.println("tRnaList = " + tRnaList);

              // --------create genome element node--------------
              long genomeElementId =
                  createGenomeElementNode(
                      versionSt, commentSt, definitionSt, inserter, genomeElementVersionIndex);

              // -----------genes-----------------
              for (String genePositionsSt : geneList) {
                geneProperties.put(GeneNode.POSITIONS_PROPERTY, genePositionsSt);
                long geneId = inserter.createNode(geneProperties);
                inserter.createRelationship(genomeElementId, geneId, genomeElementGeneRel, null);
              }

              // -----------CDS-----------------
              for (String cdsPositionsSt : cdsList) {
                cdsProperties.put(CDSNode.POSITIONS_PROPERTY, cdsPositionsSt);
                long cdsID = inserter.createNode(cdsProperties);
                inserter.createRelationship(genomeElementId, cdsID, genomeElementCDSRel, null);
              }

              // -----------misc rna-----------------
              for (String miscRnaPositionsSt : miscRnaList) {
                miscRnaProperties.put(MiscRNANode.POSITIONS_PROPERTY, miscRnaPositionsSt);
                long miscRnaID = inserter.createNode(miscRnaProperties);
                inserter.createRelationship(
                    genomeElementId, miscRnaID, genomeElementMiscRnaRel, null);
              }

              // -----------m rna-----------------
              for (String mRnaPositionsSt : mRnaList) {
                mRnaProperties.put(MRNANode.POSITIONS_PROPERTY, mRnaPositionsSt);
                long mRnaID = inserter.createNode(mRnaProperties);
                inserter.createRelationship(genomeElementId, mRnaID, genomeElementMRnaRel, null);
              }

              // -----------nc rna-----------------
              for (String ncRnaPositionsSt : ncRnaList) {
                ncRnaProperties.put(NcRNANode.POSITIONS_PROPERTY, ncRnaPositionsSt);
                long ncRnaID = inserter.createNode(ncRnaProperties);
                inserter.createRelationship(genomeElementId, ncRnaID, genomeElementNcRnaRel, null);
              }

              // -----------r rna-----------------
              for (String rRnaPositionsSt : rRnaList) {
                rRnaProperties.put(RRNANode.POSITIONS_PROPERTY, rRnaPositionsSt);
                long rRnaID = inserter.createNode(rRnaProperties);
                inserter.createRelationship(genomeElementId, rRnaID, genomeElementRRnaRel, null);
              }

              // -----------tm rna-----------------
              for (String tmRnaPositionsSt : tmRnaList) {
                tmRnaProperties.put(TmRNANode.POSITIONS_PROPERTY, tmRnaPositionsSt);
                long tmRnaID = inserter.createNode(tmRnaProperties);
                inserter.createRelationship(genomeElementId, tmRnaID, genomeElementTmRnaRel, null);
              }

              // -----------t rna-----------------
              for (String tRnaPositionsSt : tRnaList) {
                tRnaProperties.put(TRNANode.POSITIONS_PROPERTY, tRnaPositionsSt);
                long tRnaID = inserter.createNode(tRnaProperties);
                inserter.createRelationship(genomeElementId, tRnaID, genomeElementTRnaRel, null);
              }
            }
          }
        } finally {
          // release the file handle before moving on to the next file
          reader.close();
        }
      }

    } catch (Exception e) {
      // Hand the throwable to the logger so the stack trace is kept in a single record.
      logger.log(Level.SEVERE, e.getMessage(), e);
    } finally {

      // shutdown, makes sure all changes are written to disk
      // (each resource may still be null if the set-up above failed early)
      if (indexProvider != null) {
        indexProvider.shutdown();
      }
      if (inserter != null) {
        inserter.shutdown();
      }

      // closing logger file handler
      if (fh != null) {
        fh.close();
      }
    }
  }

  /**
   * Returns the trimmed text that follows {@code key} in {@code text}, i.e. the second element of
   * {@code text.split(key)}, or an empty string when the split yields fewer than two elements
   * (for example when the line holds nothing but the key).
   */
  private static String valueAfterKey(String text, String key) {
    String[] parts = text.split(key);
    return parts.length > 1 ? parts[1].trim() : "";
  }

  /**
   * Reads the location string of one feature and appends it to {@code positions}.
   *
   * <p>The location starts right after {@code featureKey} on {@code featureLine} and continues
   * over the following indented lines until a line whose trimmed text starts with {@code "/"} is
   * reached. Reading also stops at a non-indented line or at the end of the file so that the
   * caller never loses the start of the next section.
   *
   * @param reader source positioned just after {@code featureLine}
   * @param featureLine the already trimmed line that starts with {@code featureKey}
   * @param featureKey the feature keyword that introduced the line
   * @param positions list that receives the assembled location string
   * @return the first line that does not belong to the location, or {@code null} at end of file
   */
  private static String readFeaturePositions(
      BufferedReader reader, String featureLine, String featureKey, ArrayList<String> positions)
      throws java.io.IOException {
    StringBuilder positionsSt = new StringBuilder(valueAfterKey(featureLine, featureKey));

    String line = reader.readLine();
    while (line != null && line.startsWith(" ") && !line.trim().startsWith("/")) {
      positionsSt.append(line.trim());
      line = reader.readLine();
    }

    positions.add(positionsSt.toString());
    return line;
  }