/**
 * Builds the {@link Document} index used by {@link KnowledgeBasePathIndex} for a knowledge base.
 *
 * <p>The text-file directory and the index directory are both resolved below the path index
 * directory configured for {@code kb}; the actual work is delegated to
 * {@code createIndex(File, File)}.
 *
 * @param kb the knowledge base to index
 * @throws IOException if building the index fails
 */
public static void createIndex(KnowledgeBase kb) throws IOException {
  final String pathFilesLocation =
      KnowledgeConfiguration.getInstance().getKnowledgeBasePathIndexDir(kb) + TXT_FILE_DIR;
  final String indexLocation =
      KnowledgeConfiguration.getInstance().getKnowledgeBasePathIndexDir(kb) + INDEX_DIR;

  createIndex(new File(pathFilesLocation), new File(indexLocation));
}
/**
 * Creates the files containing all paths for all concepts up to a certain depth.
 *
 * <p>The concepts of {@code kb} are filtered against {@code concepts}, split into chunks and each
 * chunk is handed to a {@link ConceptIndexer} running on its own pool thread. The call blocks
 * until the shared latch reaches zero or the calling thread is interrupted. If the output
 * directory is not empty, or no concept is selected, nothing is written.
 *
 * @param kb the knowledge base whose concepts are processed
 * @param concepts the concepts to restrict the run to; an empty collection selects every concept
 *     of {@code kb}
 * @throws IOException if one of the underlying file-system operations fails
 */
public static void createIndexFiles(KnowledgeBase kb, Collection<String> concepts)
    throws IOException {
  String outputDir =
      KnowledgeConfiguration.getInstance().getKnowledgeBasePathIndexDir(kb) + TXT_FILE_DIR;
  if (!Files.isDirectoryEmpty(new File(outputDir))) {
    log.warn(outputDir + " IS NOT EMPTY, BAILING OUT ...");
    return;
  }

  int maxSearchDepth = getMaxSearchDepth();
  DirectoryFileManager dirManager = new DirectoryFileManager(outputDir, 1000);
  log.info("CREATING THE INDEXABLE PATH FILES FOR: " + kb.name() + " INTO " + outputDir);

  // Membership is tested once per knowledge-base concept, so make sure the lookup is hash-based
  // instead of a linear scan over an arbitrary collection.
  Collection<String> wanted =
      (concepts instanceof Set) ? concepts : new HashSet<String>(concepts);
  boolean selectAll = wanted.isEmpty();

  // Only open the iterator once we know we are actually going to consume it.
  Iterator<String> conceptIterator = kb.getConceptIterator();
  int idx = 0;
  List<String> goodConcepts = new ArrayList<String>();
  while (conceptIterator.hasNext()) {
    String concept = conceptIterator.next();
    if (selectAll || wanted.contains(concept)) {
      goodConcepts.add(concept);
    }
    idx++;
    if ((idx % 100000) == 0) {
      log.info("ITERATED THROUGH " + idx + " CONCEPTS SO FAR ...");
    }
  }

  if (goodConcepts.isEmpty()) {
    // Nothing to split; a zero-sized thread pool would be rejected by the executor factory.
    log.warn("NO CONCEPTS SELECTED FOR: " + kb.name() + ", NOTHING TO DO");
    return;
  }

  // Integer division yields 0 when there are fewer concepts than NSPLITS; never request a
  // zero-sized chunk. (Assumes the second argument of split is the chunk size - TODO confirm.)
  int chunkSize = Math.max(1, goodConcepts.size() / NSPLITS);
  List<List<String>> splitConcepts = Collections.split(goodConcepts, chunkSize);

  int nThreads = splitConcepts.size();
  ExecutorService threadExecutor = Executors.newFixedThreadPool(nThreads);
  try {
    CountDownLatch doneSignal = new CountDownLatch(nThreads);
    for (int i = 0; i < nThreads; i++) {
      ConceptIndexer worker =
          new ConceptIndexer(splitConcepts.get(i), i, dirManager, kb, maxSearchDepth, doneSignal);
      threadExecutor.execute(worker);
    }

    Timer timer = new Timer();
    try {
      // NOTE(review): this relies on every ConceptIndexer counting the latch down, also when it
      // fails - confirm, otherwise await() never returns.
      doneSignal.await();
    } catch (InterruptedException ie) {
      // Keep the interrupt visible to the caller instead of silently swallowing it.
      Thread.currentThread().interrupt();
      log.warn("INTERRUPTED WHILE WAITING FOR THE INDEXABLE FILES TO BE CREATED: " + ie);
    }
    timer.tick("TO CREATE INDEXABLE FILES");
  } finally {
    // Always release the pool threads; already submitted workers are still allowed to finish.
    threadExecutor.shutdown();
  }
}
/** * Creates the {@link Document} for {@link KnowledgeBasePathIndex} for a bunch of input concepts * only * * @param kb * @throws IOException */ public static void createIndex(KnowledgeBase kb, Collection<String> concepts) throws IOException { File txtFileDir = new File( KnowledgeConfiguration.getInstance().getKnowledgeBasePathIndexDir(kb) + TXT_FILE_DIR); // init the index dirs and the index writer File indexDir = new File(KnowledgeConfiguration.getInstance().getKnowledgeBasePathIndexDir(kb) + INDEX_DIR); if (!indexDir.exists()) indexDir.mkdirs(); else if (!Files.isDirectoryEmpty(indexDir)) { log.warn(indexDir + " IS NOT EMPTY, BAILING OUT ..."); return; } IndexWriter indexWriter = new IndexWriter( new SimpleFSDirectory(indexDir), new StandardAnalyzer(Version.LUCENE_29), true, IndexWriter.MaxFieldLength.UNLIMITED); int counter = 0; FilenameFilter filter = new FilenameFilter() { public boolean accept(File dir, String name) { return name.endsWith(".txt"); } }; Collection<File> files = Files.listFiles(txtFileDir, filter, true); // the intermediate concepts @depth 2 from the starting nodes Set<String> intermediate = new HashSet<String>(); // PHASE 1. index all paths starting with concepts of interest for (File file : files) { String fileName = file.getName(); String concept = fileName.substring(0, fileName.indexOf(".")); if (!concepts.contains(concept)) continue; BufferedReader reader = new BufferedReader(new FileReader(file)); Set<List<String>> paths = new HashSet<List<String>>(); while (reader.ready()) { String line = reader.readLine(); String[] path = line.split(" "); paths.add(Arrays.asList(path)); intermediate.add(path[path.length - 1]); } reader.close(); List<Document> docs = getRecords(paths); for (Document doc : docs) indexWriter.addDocument(doc); if ((counter % 1000) == 0) log.info("\tINDEXED STARTING PATHS FOR " + counter + " CONCEPTS SO FAR ... 
"); counter++; } log.info("\tINDEXED " + counter + " STARTING CONCEPTS, DONE!"); log.info("NOW INDEXING ENDING PATHS FOR " + intermediate.size() + " INTERMEDIATE CONCEPTS"); // PHASE 2. index all paths ending with concepts of interest counter = 0; for (File file : files) { String fileName = file.getName(); String concept = fileName.substring(0, fileName.indexOf(".")); if (concepts.contains(concept)) continue; // already seen if (!intermediate.contains(concept)) continue; // interested only // in intermediate BufferedReader reader = new BufferedReader(new FileReader(file)); Set<List<String>> paths = new HashSet<List<String>>(); while (reader.ready()) { String line = reader.readLine(); String[] path = line.split(" "); if (concepts.contains(path[path.length - 1])) paths.add(Arrays.asList(path)); } reader.close(); List<Document> docs = getRecords(paths); for (Document doc : docs) indexWriter.addDocument(doc); if ((counter % 1000) == 0) log.info("\tINDEXED ENDING PATHS FOR " + counter + " CONCEPTS SO FAR ... "); counter++; } log.info("\tINDEXED " + counter + " ENDING CONCEPTS, DONE!"); // closes the index writers indexWriter.optimize(); indexWriter.close(); }