/** * Creates the {@link Document} for {@link KnowledgeBasePathIndex} * * @param txtFileDir where the path txt files are * @param indexDir the directory where the index is created * @throws IOException */ public static void createIndex(File txtFileDir, File indexDir) throws IOException { if (!indexDir.exists()) indexDir.mkdirs(); else if (!Files.isDirectoryEmpty(indexDir)) { log.warn(indexDir + " IS NOT EMPTY, BAILING OUT ..."); return; } IndexWriter indexWriter = new IndexWriter( new SimpleFSDirectory(indexDir), new StandardAnalyzer(Version.LUCENE_29), true, IndexWriter.MaxFieldLength.UNLIMITED); int counter = 0; FilenameFilter filter = new FilenameFilter() { public boolean accept(File dir, String name) { return name.endsWith(".txt"); } }; Collection<File> files = Files.listFiles(txtFileDir, filter, true); for (File file : files) { BufferedReader reader = new BufferedReader(new FileReader(file)); Set<List<String>> paths = new HashSet<List<String>>(); while (reader.ready()) { String line = reader.readLine(); paths.add(Arrays.asList(line.split(" "))); } reader.close(); List<Document> docs = getRecords(paths); for (Document doc : docs) indexWriter.addDocument(doc); if ((counter % 1000) == 0) log.info("\tINDEXED PATHS FOR " + counter + " CONCEPTS SO FAR ... "); counter++; } log.info("\tINDEXED " + counter + " CONCEPTS, DONE!"); // closes the index writers indexWriter.optimize(); indexWriter.close(); }
/** * Creates the {@link Document} for {@link KnowledgeBasePathIndex} for a bunch of input concepts * only * * @param kb * @throws IOException */ public static void createIndex(KnowledgeBase kb, Collection<String> concepts) throws IOException { File txtFileDir = new File( KnowledgeConfiguration.getInstance().getKnowledgeBasePathIndexDir(kb) + TXT_FILE_DIR); // init the index dirs and the index writer File indexDir = new File(KnowledgeConfiguration.getInstance().getKnowledgeBasePathIndexDir(kb) + INDEX_DIR); if (!indexDir.exists()) indexDir.mkdirs(); else if (!Files.isDirectoryEmpty(indexDir)) { log.warn(indexDir + " IS NOT EMPTY, BAILING OUT ..."); return; } IndexWriter indexWriter = new IndexWriter( new SimpleFSDirectory(indexDir), new StandardAnalyzer(Version.LUCENE_29), true, IndexWriter.MaxFieldLength.UNLIMITED); int counter = 0; FilenameFilter filter = new FilenameFilter() { public boolean accept(File dir, String name) { return name.endsWith(".txt"); } }; Collection<File> files = Files.listFiles(txtFileDir, filter, true); // the intermediate concepts @depth 2 from the starting nodes Set<String> intermediate = new HashSet<String>(); // PHASE 1. index all paths starting with concepts of interest for (File file : files) { String fileName = file.getName(); String concept = fileName.substring(0, fileName.indexOf(".")); if (!concepts.contains(concept)) continue; BufferedReader reader = new BufferedReader(new FileReader(file)); Set<List<String>> paths = new HashSet<List<String>>(); while (reader.ready()) { String line = reader.readLine(); String[] path = line.split(" "); paths.add(Arrays.asList(path)); intermediate.add(path[path.length - 1]); } reader.close(); List<Document> docs = getRecords(paths); for (Document doc : docs) indexWriter.addDocument(doc); if ((counter % 1000) == 0) log.info("\tINDEXED STARTING PATHS FOR " + counter + " CONCEPTS SO FAR ... "); counter++; } log.info("\tINDEXED " + counter + " STARTING CONCEPTS, DONE!"); log.info("NOW INDEXING ENDING PATHS FOR " + intermediate.size() + " INTERMEDIATE CONCEPTS"); // PHASE 2. index all paths ending with concepts of interest counter = 0; for (File file : files) { String fileName = file.getName(); String concept = fileName.substring(0, fileName.indexOf(".")); if (concepts.contains(concept)) continue; // already seen if (!intermediate.contains(concept)) continue; // interested only // in intermediate BufferedReader reader = new BufferedReader(new FileReader(file)); Set<List<String>> paths = new HashSet<List<String>>(); while (reader.ready()) { String line = reader.readLine(); String[] path = line.split(" "); if (concepts.contains(path[path.length - 1])) paths.add(Arrays.asList(path)); } reader.close(); List<Document> docs = getRecords(paths); for (Document doc : docs) indexWriter.addDocument(doc); if ((counter % 1000) == 0) log.info("\tINDEXED ENDING PATHS FOR " + counter + " CONCEPTS SO FAR ... "); counter++; } log.info("\tINDEXED " + counter + " ENDING CONCEPTS, DONE!"); // closes the index writers indexWriter.optimize(); indexWriter.close(); }