/**
  * Given a graph database gDB, first prune graphs that contain infrequent edges (as determined by
  * the edgeIndex), then randomly select chooseN graphs from the remaining ones.
  *
  * @param gDB the on-disk graph database to sample from
  * @param chooseN the number of graphs to select
  * @param pruneDistinctEdges whether to drop graphs containing infrequent edges before sampling
  * @param edgeIndex the edge index used to test for infrequent edges
  * @return the randomly chosen graphs
  * @throws IOException
  * @throws ParseException
  */
 public static Graph[] randomlyChooseDBGraph(
     GraphDatabase_OnDisk gDB, int chooseN, boolean pruneDistinctEdges, EdgeIndex edgeIndex)
     throws IOException, ParseException {
    if (!pruneDistinctEdges) return randomlyChooseDBGraph(gDB, chooseN);
    // First step: filter out graphs that contain infrequent edges
   String tempFileName = gDB.getDBFileName() + "_temp";
   BufferedWriter tempDBWriter = new BufferedWriter(new FileWriter(tempFileName));
   int count = 0;
   float edgeNum = 0, nodeNum = 0;
   for (int i = 0; i < gDB.getTotalNum(); i++) {
     Graph theGraph = gDB.findGraph(i);
     if (edgeIndex.containInfrequentEdges(theGraph)) continue;
     else {
       tempDBWriter.write(count++ + " => " + gDB.findGraphString(i) + "\n");
       edgeNum += theGraph.getEdgeCount();
       nodeNum += theGraph.getNodeCount();
     }
   }
   tempDBWriter.close();
    // Suggest a run of the Java garbage collector
   Runtime r = Runtime.getRuntime();
   r.gc();
   // Write the meta information of the new file
   BufferedWriter metaWriter = new BufferedWriter(new FileWriter(tempFileName + "_Meta"));
   // 1. Processing Date
   SimpleDateFormat bartDateFormat = new SimpleDateFormat("EEEE-MMMM-dd-yyyy");
   Date date = new Date();
   metaWriter.write(bartDateFormat.format(date));
   metaWriter.newLine();
   // 2. Number of graphs in this file
   metaWriter.write("Number of Graphs:" + count);
   metaWriter.newLine();
   metaWriter.write(
       "Average EdgeNum: "
           + (float) (edgeNum) / count
           + ", Average NodeNum: "
           + (float) (nodeNum) / count);
   // Close meta data file
   try {
     metaWriter.close();
   } catch (IOException e) {
     e.printStackTrace();
   }
   return randomlyChooseDBGraph(new GraphDatabase_OnDisk(tempFileName, gDB.getParser()), chooseN);
 }
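
A minimal usage sketch of the helper above. The enclosing class name Processor (suggested by the log messages elsewhere in this file) and the pre-built edgeIndex are assumptions; the GraphDatabase_OnDisk constructor mirrors the call inside the method.

// Hypothetical usage; Processor and buildEdgeIndex are assumptions, not part of the original code.
GraphDatabase_OnDisk gDB =
    new GraphDatabase_OnDisk("molecules.smiles", MyFactory.getSmilesParser());
EdgeIndex edgeIndex = buildEdgeIndex(gDB); // hypothetical helper, built elsewhere
// Drop graphs containing infrequent edges, then sample 100 of the survivors.
Graph[] sample = Processor.randomlyChooseDBGraph(gDB, 100, true, edgeIndex);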
  @Override
  public Document vectorize(int gID, Graph g, Map<Integer, String> gHash) throws ParseException {
    Document gDoc = new Document();

    if (g.getEdgeCount() == 0) return gDoc;
    String graphString = gParser.serialize(g);
    Field stringField = new Field("gString", graphString, Field.Store.YES, Field.Index.NO);
    gDoc.add(stringField);
     Field IDField = new Field("gID", Integer.toString(gID), Field.Store.YES, Field.Index.NO);
    gDoc.add(IDField);

    List<Integer> allIDs = searcher.subgraphs(g, new SearchStatus());

     // 0. Add a "-1" sentinel term to the subGraphs field so that pure MUST_NOT searches
     // still have a clause that matches every document
     gDoc.add(
         new Field("subGraphs", Integer.toString(-1), Field.Store.NO, Field.Index.NOT_ANALYZED));

    if (allIDs == null || allIDs.size() == 0) return gDoc;

     // 1. Add each matching subgraph ID (or its hashed label from gHash) as a subGraphs term
    Collections.sort(allIDs);
    for (int i = 0; i < allIDs.size(); i++) {
      if (gHash == null) {
        String byteString = allIDs.get(i).toString();
        gDoc.add(new Field("subGraphs", byteString, Field.Store.NO, Field.Index.NOT_ANALYZED));
      } else {
        gDoc.add(
            new Field(
                "subGraphs", gHash.get(allIDs.get(i)), Field.Store.NO, Field.Index.NOT_ANALYZED));
      }
    }
    return gDoc;
  }
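
A short sketch of how the "-1" sentinel written above could be used on the query side. This is not part of the original code; it only illustrates the Lucene 3.x idiom the comment refers to: a BooleanQuery made only of MUST_NOT clauses matches nothing, so exclusion-only queries anchor on the sentinel term. The excludedFeatureTerms list is hypothetical, and the org.apache.lucene.search / org.apache.lucene.index imports are assumed.

// Illustrative only: a pure-exclusion search over the subGraphs field,
// anchored on the "-1" term that every vectorized graph contains.
BooleanQuery query = new BooleanQuery();
query.add(new TermQuery(new Term("subGraphs", "-1")), BooleanClause.Occur.MUST);
for (String feature : excludedFeatureTerms) { // hypothetical list of feature labels to exclude
  query.add(new TermQuery(new Term("subGraphs", feature)), BooleanClause.Occur.MUST_NOT);
}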
 public static void saveGDB(Graph[] graphs, GraphParser gParser, String fileName)
     throws IOException {
    // Write every graph in the array to the new database file
   BufferedWriter dbWriter = new BufferedWriter(new FileWriter(fileName));
   int count = 0;
   float edgeNum = 0, nodeNum = 0;
   for (int i = 0; i < graphs.length; i++) {
     Graph theGraph = graphs[i];
     dbWriter.write(count++ + " => " + gParser.serialize(theGraph) + "\n");
     edgeNum += theGraph.getEdgeCount();
     nodeNum += theGraph.getNodeCount();
   }
   dbWriter.close();
    // Suggest a run of the Java garbage collector
   Runtime r = Runtime.getRuntime();
   r.gc();
   // Write the meta information of the new file
   BufferedWriter metaWriter = new BufferedWriter(new FileWriter(fileName + "_Meta"));
   // 1. Processing Date
   SimpleDateFormat bartDateFormat = new SimpleDateFormat("EEEE-MMMM-dd-yyyy");
   Date date = new Date();
   metaWriter.write(bartDateFormat.format(date));
   metaWriter.newLine();
   // 2. Number of graphs in this file
   metaWriter.write("Number of Graphs:" + count);
   metaWriter.newLine();
   metaWriter.write(
       "Average EdgeNum: "
           + (float) (edgeNum) / count
           + ", Average NodeNum: "
           + (float) (nodeNum) / count);
   // Close meta data file
   try {
     metaWriter.close();
   } catch (IOException e) {
     e.printStackTrace();
   }
 }
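
A hypothetical round trip combining the two helpers above: sample graphs from an on-disk database and persist the sample together with its "_Meta" summary. The class name Processor is an assumption, and gDB is the database handle from the earlier sketch.

// Hypothetical round trip; Processor as the enclosing class is an assumption.
Graph[] chosen = Processor.randomlyChooseDBGraph(gDB, 50, false, null);
Processor.saveGDB(chosen, MyFactory.getSmilesParser(), "sampled.smiles");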
  public static void changeFormatSimple(
      String SDFFileName, String SmilesFileName, GraphFactory graphFactory) throws IOException {
    // Open InputFile
    BufferedReader fileBufReader = new BufferedReader(new FileReader(SDFFileName));
    // Open OutputFile
    BufferedWriter outputWriter = new BufferedWriter(new FileWriter(SmilesFileName));

    SDFParserModified SDFParser = MyFactory.getSDFParserM();
    SmilesParser smilesParser = MyFactory.getSmilesParser();
    Graph oneGraph = null;

    double aveEdgeCount = 0;
    double aveNodeCount = 0;

    int index = 0;
    String spliter = " => ";
    while (fileBufReader.ready()) {
      try {
        oneGraph = SDFParser.parse(fileBufReader, graphFactory);
      } catch (ParseException e) {
        System.out.println("skip one graph");
        while (fileBufReader.ready()) {
          String aLine = fileBufReader.readLine();
          if (aLine.equals("$$$$")) break;
        }
        continue;
      }

      if (GraphConnectivityTester.isConnected(oneGraph)) {
        aveEdgeCount =
            (aveEdgeCount) * (index / ((double) index + 1))
                + oneGraph.getEdgeCount() / ((double) index + 1);
        aveNodeCount =
            (aveNodeCount) * (index / ((double) index + 1))
                + oneGraph.getNodeCount() / ((double) index + 1);
        if (index > 0) outputWriter.newLine();
        outputWriter.write(index + spliter + smilesParser.serialize(oneGraph));
        index++;
      }
    }
    // Close input File
    try {
      fileBufReader.close();
      outputWriter.flush();
      outputWriter.close();
    } catch (IOException e) {
      e.printStackTrace();
    }
     System.out.println(
         "In Processor.changeFormatSimple: "
             + index
             + " graphs have been formatted into SMILES format");
     // Suggest a run of the Java garbage collector
    Runtime r = Runtime.getRuntime();
    r.gc();
     // Write the meta information of the SMILES data file
    BufferedWriter metaWriter = new BufferedWriter(new FileWriter(SmilesFileName + "_Meta"));
    // 1. Processing Date
    SimpleDateFormat bartDateFormat = new SimpleDateFormat("EEEE-MMMM-dd-yyyy");
    Date date = new Date();
    metaWriter.write(bartDateFormat.format(date));
    metaWriter.newLine();
    // 2. Number of graphs in this file
    metaWriter.write("Number of Graphs:" + index);
    metaWriter.write("Ave EdgeCount: " + aveEdgeCount + " Ave NodeCount: " + aveNodeCount);
    // Close meta data file
    try {
      metaWriter.flush();
      metaWriter.close();
    } catch (IOException e) {
      e.printStackTrace();
    }
  }
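
A sketch of reading the SMILES file produced above, assuming only the "index => smiles" line layout that changeFormatSimple writes; the file name is a placeholder.

// Read back the "index => smiles" lines written by changeFormatSimple.
BufferedReader smilesReader = new BufferedReader(new FileReader("molecules.smiles"));
String line;
while ((line = smilesReader.readLine()) != null) {
  String smiles = line.split(" => ")[1];
  Graph g = MyFactory.getSmilesParser().parse(smiles, MyFactory.getGraphFactory());
  // ... use g ...
}
smilesReader.close();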
  /**
   * Randomly selects 2 * chooseN graphs from the SMILES database and splits them into two
   * disjoint files of chooseN graphs each, chosenDBFile + "_1" and chosenDBFile + "_2".
   *
   * @param smilesDBFile the SMILES database file (its "_Meta" file supplies the graph count)
   * @param chooseN the number of graphs written to each of the two output files
   * @param chosenDBFile the prefix of the two output files
   * @throws ParseException
   * @throws IOException
   */
  public static void randomlyChooseTwoDBGraph(String smilesDBFile, int chooseN, String chosenDBFile)
      throws ParseException, IOException {
    // First get the number of graphs in this database, which can be found in the metadata file
    BufferedReader metaFile = new BufferedReader(new FileReader(smilesDBFile + "_Meta"));
    metaFile.readLine();
    String[] temp = metaFile.readLine().split(":");
    int m = Integer.parseInt(temp[1]);
    metaFile.close();
    int edgeNum = 0, nodeNum = 0;
    // Then randomly select 2 * chooseN of the m indexes (partial Fisher-Yates shuffle);
    // the selected prefix is sorted afterwards
    int[] indexes = new int[m];
    for (int i = 0; i < m; i++) indexes[i] = i;
    Random rd = new Random();
    int j = 0;
    int swapTemp = 0;
    for (int i = 0; i < 2 * chooseN; i++) {
      j = (int) (rd.nextFloat() * (m - i)) + i;
      swapTemp = indexes[i];
      indexes[i] = indexes[j];
      indexes[j] = swapTemp;
    }
    Arrays.sort(indexes, 0, 2 * chooseN);
    // Read those db graphs and save them into the new file
    BufferedReader fullDBReader = new BufferedReader(new FileReader(smilesDBFile));
    BufferedWriter chosenDBWriter1 = new BufferedWriter(new FileWriter(chosenDBFile + "_1"));
    BufferedWriter chosenDBWriter2 = new BufferedWriter(new FileWriter(chosenDBFile + "_2"));
    int fileLineIndex = 0;
    int i = 0;
    String aLine = null;
    String spliter = " => ";
    while ((aLine = fullDBReader.readLine()) != null && i < 2 * chooseN) {
      if (fileLineIndex < indexes[i]) {
        fileLineIndex++;
        continue; // keep on reading
      } else if (fileLineIndex == indexes[i] && i < chooseN) {
        // Output line format: newIndex => smiles
        String gString = aLine.split(spliter)[1];
        Graph g = MyFactory.getSmilesParser().parse(gString, MyFactory.getGraphFactory());
        edgeNum += g.getEdgeCount();
        nodeNum += g.getNodeCount();
        chosenDBWriter1.write(i + spliter + gString);
        chosenDBWriter1.newLine();
        i++;
        fileLineIndex++;
      } else if (fileLineIndex == indexes[i] && i >= chooseN) {
        // Output line format: newIndex => smiles
        String gString = aLine.split(spliter)[1];
        Graph g = MyFactory.getSmilesParser().parse(gString, MyFactory.getGraphFactory());
        edgeNum += g.getEdgeCount();
        nodeNum += g.getNodeCount();
        chosenDBWriter2.write(i + spliter + gString);
        chosenDBWriter2.newLine();
        i++;
        fileLineIndex++;
      } else if (fileLineIndex > indexes[i])
        System.out.println("Exception: Processor: randomlyChooseDBGraph");
    }

    // Close out File
    try {
      chosenDBWriter1.close();
      chosenDBWriter2.close();
      fullDBReader.close();
    } catch (IOException e) {
      e.printStackTrace();
    }
    // Suggest a run of the Java garbage collector
    Runtime r = Runtime.getRuntime();
    r.gc();
    // Write the meta information of the new file
    BufferedWriter metaWriter = new BufferedWriter(new FileWriter(chosenDBFile + "_1" + "_Meta"));
    // 1. Processing Date
    SimpleDateFormat bartDateFormat = new SimpleDateFormat("EEEE-MMMM-dd-yyyy");
    Date date = new Date();
    metaWriter.write(bartDateFormat.format(date));
    metaWriter.newLine();
    // 2. Number of graphs in this file
    metaWriter.write("Number of Graphs:" + i);
    metaWriter.newLine();
    metaWriter.write(
        "Average EdgeNum: "
            + (float) (edgeNum) / i
            + ", Average NodeNum: "
            + (float) (nodeNum) / i);
    // Close meta data file
    try {
      metaWriter.close();
    } catch (IOException e) {
      e.printStackTrace();
    }
  }
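
A one-line usage sketch: the call below would split db.smiles into two disjoint random subsets of 100 graphs each, written to chosen.smiles_1 and chosen.smiles_2 (plus chosen.smiles_1_Meta). The class name Processor and the file names are assumptions.

// Hypothetical call; file names are placeholders.
Processor.randomlyChooseTwoDBGraph("db.smiles", 100, "chosen.smiles");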