コード例 #1
0
 /**
  * Creates the {@link Document} for {@link KnowledgeBasePathIndex}.
  *
  * <p>Both directories are resolved relative to the path-index directory that
  * {@link KnowledgeConfiguration} reports for {@code kb}; the actual work is delegated to the
  * {@code createIndex(File, File)} overload.
  *
  * @param kb the knowledge base to index
  * @throws IOException propagated from the calls made here
  */
 public static void createIndex(KnowledgeBase kb) throws IOException {
   // where the text files live
   String txtFilePath =
       KnowledgeConfiguration.getInstance().getKnowledgeBasePathIndexDir(kb) + TXT_FILE_DIR;
   // where the index is written
   String indexPath =
       KnowledgeConfiguration.getInstance().getKnowledgeBasePathIndexDir(kb) + INDEX_DIR;

   createIndex(new File(txtFilePath), new File(indexPath));
 }
コード例 #2
0
  /**
   * Creates the files containing all paths for all concepts up to a certain depth.
   *
   * <p>Only logs a warning and returns when the output directory is not empty, or when no concept
   * is selected. Otherwise the selected concepts are split into chunks, one {@code ConceptIndexer}
   * per chunk is submitted to a fixed thread pool, and the call waits on a latch sized to the
   * number of workers (each worker receives that latch).
   *
   * @param kb the knowledge base whose concepts are iterated
   * @param concepts the concepts to create files for; an empty collection selects every concept
   *     returned by the knowledge base iterator
   * @throws IOException propagated from the calls made here
   */
  public static void createIndexFiles(KnowledgeBase kb, Collection<String> concepts)
      throws IOException {
    Iterator<String> conceptIterator = kb.getConceptIterator();
    String outputDir =
        KnowledgeConfiguration.getInstance().getKnowledgeBasePathIndexDir(kb) + TXT_FILE_DIR;
    if (!Files.isDirectoryEmpty(new File(outputDir))) {
      log.warn(outputDir + " IS NOT EMPTY, BAILING OUT ...");
      return;
    }

    int maxSearchDepth = getMaxSearchDepth();
    DirectoryFileManager dirManager = new DirectoryFileManager(outputDir, 1000);
    log.info("CREATING THE INDEXABLE PATH FILES FOR: " + kb.name() + " INTO " + outputDir);

    // collect the concepts to process: everything when no filter is given, else the filtered ones
    int idx = 0;
    List<String> goodConcepts = new ArrayList<String>();
    while (conceptIterator.hasNext()) {
      String concept = conceptIterator.next();
      if (concepts.isEmpty() || concepts.contains(concept)) goodConcepts.add(concept);

      idx++;
      if ((idx % 100000) == 0) log.info("ITERATED THROUGH " + idx + " CONCEPTS SO FAR ...");
    }

    // without this guard an empty selection ends up asking for a zero-sized thread pool,
    // which Executors.newFixedThreadPool rejects with an IllegalArgumentException
    if (goodConcepts.isEmpty()) {
      log.warn("NO CONCEPTS SELECTED FOR: " + kb.name() + ", NOTHING TO DO ...");
      return;
    }

    // with fewer concepts than NSPLITS the integer division yields 0; never pass less than 1
    // NOTE(review): the second argument looks like a chunk size -- confirm against
    // Collections.split
    List<List<String>> splitConcepts =
        Collections.split(goodConcepts, Math.max(1, goodConcepts.size() / NSPLITS));
    int nThreads = splitConcepts.size();
    ExecutorService threadExecutor = Executors.newFixedThreadPool(nThreads);
    CountDownLatch doneSignal = new CountDownLatch(nThreads);

    try {
      for (int i = 0; i < nThreads; i++) {
        ConceptIndexer worker =
            new ConceptIndexer(splitConcepts.get(i), i, dirManager, kb, maxSearchDepth, doneSignal);
        threadExecutor.execute(worker);
      }
    } finally {
      // orderly shutdown: already submitted workers still run, but the pool threads are
      // released afterwards even if a submission failed or the wait below is interrupted
      threadExecutor.shutdown();
    }

    Timer timer = new Timer();
    try {
      doneSignal.await();
    } catch (InterruptedException ie) {
      // restore the interrupt status so that callers can react to it
      Thread.currentThread().interrupt();
      log.warn("INTERRUPTED WHILE WAITING FOR THE INDEXABLE FILES: " + ie);
    }
    timer.tick("TO CREATE INDEXABLE FILES");
  }
コード例 #3
0
  /**
   * Creates the {@link Document} for {@link KnowledgeBasePathIndex} for a bunch of input concepts
   * only.
   *
   * <p>Two passes are made over the {@code .txt} files found under the text-file directory:
   *
   * <ol>
   *   <li>for files whose name (up to the first {@code '.'}) is one of {@code concepts}, every
   *       path in the file is indexed and the last node of each path is remembered as an
   *       intermediate concept;
   *   <li>for files named after an intermediate concept that is not itself in {@code concepts},
   *       only the paths whose last node is in {@code concepts} are indexed.
   * </ol>
   *
   * <p>Only logs a warning and returns when the index directory exists and is not empty.
   *
   * @param kb the knowledge base whose path-index directories are used
   * @param concepts the concepts of interest
   * @throws IOException if reading a path file or writing the index fails
   */
  public static void createIndex(KnowledgeBase kb, Collection<String> concepts) throws IOException {
    File txtFileDir =
        new File(
            KnowledgeConfiguration.getInstance().getKnowledgeBasePathIndexDir(kb) + TXT_FILE_DIR);
    // init the index dirs and the index writer
    File indexDir =
        new File(KnowledgeConfiguration.getInstance().getKnowledgeBasePathIndexDir(kb) + INDEX_DIR);

    if (!indexDir.exists()) indexDir.mkdirs();
    else if (!Files.isDirectoryEmpty(indexDir)) {
      log.warn(indexDir + " IS NOT EMPTY, BAILING OUT ...");
      return;
    }

    IndexWriter indexWriter =
        new IndexWriter(
            new SimpleFSDirectory(indexDir),
            new StandardAnalyzer(Version.LUCENE_29),
            true,
            IndexWriter.MaxFieldLength.UNLIMITED);

    // try/finally so that the writer is closed even when reading or indexing fails half-way
    try {
      FilenameFilter filter =
          new FilenameFilter() {
            public boolean accept(File dir, String name) {
              return name.endsWith(".txt");
            }
          };
      Collection<File> files = Files.listFiles(txtFileDir, filter, true);

      // the last nodes of all paths starting from the concepts of interest
      Set<String> intermediate = new HashSet<String>();

      // PHASE 1. index all paths starting with concepts of interest
      int counter = 0;
      for (File file : files) {
        String fileName = file.getName();
        // NOTE(review): cuts at the FIRST dot -- confirm concept names never contain '.'
        String concept = fileName.substring(0, fileName.indexOf("."));
        if (!concepts.contains(concept)) continue;

        Set<List<String>> paths = readPathFile(file);
        for (List<String> path : paths) intermediate.add(path.get(path.size() - 1));

        List<Document> docs = getRecords(paths);
        for (Document doc : docs) indexWriter.addDocument(doc);

        // count first, then report, so the progress line never claims "0 CONCEPTS"
        counter++;
        if ((counter % 1000) == 0)
          log.info("\tINDEXED STARTING PATHS FOR " + counter + " CONCEPTS SO FAR ... ");
      }
      log.info("\tINDEXED " + counter + " STARTING CONCEPTS, DONE!");

      log.info("NOW INDEXING ENDING PATHS FOR " + intermediate.size() + " INTERMEDIATE CONCEPTS");

      // PHASE 2. index all paths ending with concepts of interest
      counter = 0;
      for (File file : files) {
        String fileName = file.getName();
        String concept = fileName.substring(0, fileName.indexOf("."));
        if (concepts.contains(concept)) continue; // already seen
        if (!intermediate.contains(concept)) continue; // interested only in intermediate

        // keep only the paths that lead back to a concept of interest
        Set<List<String>> paths = new HashSet<List<String>>();
        for (List<String> path : readPathFile(file)) {
          if (concepts.contains(path.get(path.size() - 1))) paths.add(path);
        }

        List<Document> docs = getRecords(paths);
        for (Document doc : docs) indexWriter.addDocument(doc);

        counter++;
        if ((counter % 1000) == 0)
          log.info("\tINDEXED ENDING PATHS FOR " + counter + " CONCEPTS SO FAR ... ");
      }
      log.info("\tINDEXED " + counter + " ENDING CONCEPTS, DONE!");

      indexWriter.optimize();
    } finally {
      // closes the index writer
      indexWriter.close();
    }
  }

  /**
   * Reads one path file: each non-empty line is split on single spaces into the nodes of a path.
   *
   * <p>Lines that are empty, or that split into no nodes at all, are skipped; duplicate paths
   * collapse because the result is a set. The reader is closed even when reading fails.
   *
   * @param file the path file to read
   * @return the distinct paths found in {@code file}, each with at least one node
   * @throws IOException if the file cannot be opened or read
   */
  private static Set<List<String>> readPathFile(File file) throws IOException {
    Set<List<String>> paths = new HashSet<List<String>>();
    BufferedReader reader = new BufferedReader(new FileReader(file));
    try {
      String line;
      // readLine() == null is the end-of-file signal; ready() only says whether a read may block
      while ((line = reader.readLine()) != null) {
        if (line.length() == 0) continue;
        String[] nodes = line.split(" ");
        // a line made only of separators splits into an empty array
        if (nodes.length == 0) continue;
        paths.add(Arrays.asList(nodes));
      }
    } finally {
      reader.close();
    }
    return paths;
  }