private void indexFiles(String dir, String index, int featureIndex, boolean createNewIndex)
        throws IOException {
    ArrayList<String> images = FileUtils.getAllImages(new File(dir), true);
    IndexWriter iw = LuceneUtils.createIndexWriter(
            index, createNewIndex, LuceneUtils.AnalyzerType.WhitespaceAnalyzer);
    // select one feature for the large index:
    int count = 0;
    long ms = System.currentTimeMillis();
    DocumentBuilder builder = new ChainedDocumentBuilder();
    ((ChainedDocumentBuilder) builder).addBuilder(builders[featureIndex]);
    // ((ChainedDocumentBuilder) builder).addBuilder(builders[0]);
    for (Iterator<String> iterator = images.iterator(); iterator.hasNext(); ) {
        count++;
        if (count > 100 && count % 5000 == 0) {
            System.out.println(count + " files indexed. "
                    + (System.currentTimeMillis() - ms) / count + " ms per file");
        }
        String file = iterator.next();
        try {
            iw.addDocument(builder.createDocument(new FileInputStream(file), file));
        } catch (Exception e) {
            System.err.println("Error: " + e.getMessage());
        }
    }
    iw.close();
}
private void indexFiles(ArrayList<String> images, DocumentBuilder builder, String indexPath)
        throws IOException {
    // TODO: check whether the index directory exists before writing.
    // Note: this writes to the class-level testIndex, not to the indexPath parameter.
    IndexWriter iw = LuceneUtils.createIndexWriter(testIndex, false);
    int count = 0;
    long time = System.currentTimeMillis();
    for (String identifier : images) {
        // TODO: cut toes from the image ... -> doesn't work out very well. Stable at first, decreasing then.
        // TODO: Joint Histogram ...
        // TODO: LSA / PCA on the vectors ... -> this looks like a job for me :-D
        // TODO: local features ...
        Document doc = null;
        if (cutImages) {
            BufferedImage bimg =
                    ImageUtils.cropImage(ImageIO.read(new FileInputStream(identifier)), 0, 0, 200, 69);
            doc = builder.createDocument(bimg, identifier);
        } else {
            doc = builder.createDocument(new FileInputStream(identifier), identifier);
        }
        iw.addDocument(doc);
        count++;
        if (count % 100 == 0) {
            int percent = (int) Math.floor(((double) count * 100.0) / (double) images.size());
            double timeTemp = (double) (System.currentTimeMillis() - time) / 1000d;
            int secsLeft =
                    (int) Math.round(((timeTemp / (double) count) * (double) images.size()) - timeTemp);
            System.out.println(percent + "% finished (" + count + " files), " + secsLeft + " s left");
        }
    }
    long timeTaken = (System.currentTimeMillis() - time);
    float sec = ((float) timeTaken) / 1000f;
    System.out.println(sec + " seconds taken, " + (timeTaken / count) + " ms per image.");
    iw.commit();
    iw.close();
}
public void testSurfIndexing() throws IOException {
    ArrayList<String> images = FileUtils.getAllImages(new File(testExtensive), true);
    ChainedDocumentBuilder db = new ChainedDocumentBuilder();
    db.addBuilder(new SurfDocumentBuilder());
    IndexWriter iw = LuceneUtils.createIndexWriter("sift-idx", true);
    for (int i = 0; i < images.size(); i++) {
        // int sampleQuery = sampleQueries[i];
        // String s = testExtensive + "/" + sampleQuery + ".jpg";
        iw.addDocument(db.createDocument(new FileInputStream(images.get(i)), images.get(i)));
        if (i % 100 == 99) System.out.print(".");
        if (i % 1000 == 999) System.out.print(" ~ " + i + " files indexed\n");
        if (i > 1000) break;
    }
    System.out.println();
    iw.close();
}
public double testIndexing() throws IOException, IllegalAccessException, InstantiationException {
    LocalitySensitiveHashing.generateHashFunctions();
    LocalitySensitiveHashing.readHashFunctions();
    DocumentBuilder builder = new ChainedDocumentBuilder();
    ((ChainedDocumentBuilder) builder).addBuilder(DocumentBuilderFactory.getCEDDDocumentBuilder());
    // System.out.println("-< Getting files to index >--------------");
    ArrayList<String> images = FileUtils.getAllImages(new File(testExtensive), true);
    // System.out.println("-< Indexing " + images.size() + " files >--------------");
    IndexWriter iw = LuceneUtils.createIndexWriter(
            indexPath, true, LuceneUtils.AnalyzerType.WhitespaceAnalyzer);
    int count = 0;
    long time = System.currentTimeMillis();
    for (String identifier : images) {
        // extract the CEDD feature and store it as a binary field
        CEDD cedd = new CEDD();
        cedd.extract(ImageIO.read(new FileInputStream(identifier)));
        Document doc = new Document();
        doc.add(new Field(DocumentBuilder.FIELD_NAME_CEDD, cedd.getByteArrayRepresentation()));
        doc.add(new Field(DocumentBuilder.FIELD_NAME_IDENTIFIER, identifier,
                Field.Store.YES, Field.Index.NOT_ANALYZED));
        // store the LSH bucket ids as a whitespace-separated, analyzed text field
        int[] hashes = LocalitySensitiveHashing.generateHashes(cedd.getDoubleHistogram());
        StringBuilder hash = new StringBuilder(512);
        for (int i = 0; i < hashes.length; i++) {
            hash.append(hashes[i]);
            hash.append(' ');
        }
        // System.out.println("hash = " + hash);
        doc.add(new Field("hash", hash.toString(), Field.Store.YES, Field.Index.ANALYZED));
        iw.addDocument(doc);
        count++;
        // if (count % 100 == 0) System.out.println(count + " files indexed.");
    }
    long timeTaken = (System.currentTimeMillis() - time);
    float sec = ((float) timeTaken) / 1000f;
    // System.out.println(sec + " seconds taken, " + (timeTaken / count) + " ms per image.");
    iw.close();
    return testSearch();
}
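// Illustrative sketch, not part of the original test class: one way the LSH "hash" field
// written by testIndexing() above could be queried at search time. Candidates sharing hash
// terms with the query image are fetched with a boolean OR query and then re-ranked by the
// exact CEDD distance. The method name, the index path parameter and the candidate count of
// 100 are assumptions; the Lucene 3.x-style calls match the field API used above.
public List<String> searchByHash(BufferedImage queryImage, String indexPath) throws IOException {
    // extract the query feature and compute its LSH bucket ids
    CEDD queryFeature = new CEDD();
    queryFeature.extract(queryImage);
    int[] hashes = LocalitySensitiveHashing.generateHashes(queryFeature.getDoubleHistogram());
    BooleanQuery query = new BooleanQuery();
    for (int h : hashes) {
        query.add(new TermQuery(new Term("hash", Integer.toString(h))), BooleanClause.Occur.SHOULD);
    }
    IndexReader reader = IndexReader.open(FSDirectory.open(new File(indexPath)));
    IndexSearcher searcher = new IndexSearcher(reader);
    TopDocs topDocs = searcher.search(query, 100);
    // re-rank the hash candidates by the exact feature distance
    List<AbstractMap.SimpleEntry<Float, String>> ranked =
            new ArrayList<AbstractMap.SimpleEntry<Float, String>>();
    for (ScoreDoc sd : topDocs.scoreDocs) {
        Document d = reader.document(sd.doc);
        CEDD candidate = new CEDD();
        candidate.setByteArrayRepresentation(d.getBinaryValue(DocumentBuilder.FIELD_NAME_CEDD));
        ranked.add(new AbstractMap.SimpleEntry<Float, String>(
                candidate.getDistance(queryFeature),
                d.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]));
    }
    Collections.sort(ranked, new Comparator<AbstractMap.SimpleEntry<Float, String>>() {
        public int compare(AbstractMap.SimpleEntry<Float, String> a, AbstractMap.SimpleEntry<Float, String> b) {
            return Float.compare(a.getKey(), b.getKey());
        }
    });
    List<String> identifiers = new ArrayList<String>(ranked.size());
    for (AbstractMap.SimpleEntry<Float, String> e : ranked) identifiers.add(e.getValue());
    return identifiers;
}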
private Document indexFiles() throws IOException {
    System.out.println("---< indexing >-------------------------");
    int count = 0;
    DocumentBuilder builder = getDocumentBuilder();
    ArrayList<String> allImages = FileUtils.getAllImages(new File("wang-1000"), true);
    IndexWriter iw = LuceneUtils.createIndexWriter(indexPath, true);
    Document document = null;
    for (Iterator<String> iterator = allImages.iterator(); iterator.hasNext(); ) {
        String filename = iterator.next();
        BufferedImage image = ImageIO.read(new FileInputStream(filename));
        document = builder.createDocument(image, filename);
        iw.addDocument(document);
        count++;
        if (count % 50 == 0)
            System.out.println("finished " + (count * 100) / allImages.size() + "% of the images.");
    }
    iw.close();
    return document;
}
public void run() {
    try {
        IndexWriter indexWriter = LuceneUtils.createIndexWriter(
                indexPath, overwriteIndex, LuceneUtils.AnalyzerType.WhitespaceAnalyzer);
        for (Iterator<File> iterator = inputFiles.iterator(); iterator.hasNext(); ) {
            File inputFile = iterator.next();
            if (verbose) System.out.println("Processing " + inputFile.getPath() + ".");
            // pass 0: count the images in the data file
            if (verbose) System.out.println("Counting images.");
            run = 0;
            readFile(indexWriter, inputFile);
            if (verbose) System.out.printf("%d images found in the data file.\n", docCount);
            int numberOfRepresentatives = 1000; // TODO: clever selection.
            // select a number of representative "fixed stars" randomly from the file
            if (numberOfRepresentatives > Math.sqrt(docCount))
                numberOfRepresentatives = (int) Math.sqrt(docCount);
            if (verbose)
                System.out.printf("Selecting %d representative images for hashing.\n", numberOfRepresentatives);
            representativesID = new HashSet<Integer>(numberOfRepresentatives);
            while (representativesID.size() < numberOfRepresentatives) {
                representativesID.add((int) Math.floor(Math.random() * (docCount - 1)));
            }
            representatives = new ArrayList<LireFeature>(numberOfRepresentatives);
            // pass 1: read the selected representative features from the data file
            docCount = 0;
            run = 1;
            if (verbose) System.out.println("Now getting representatives from the data file.");
            readFile(indexWriter, inputFile);
            // pass 2: index all images from the data file
            docCount = 0;
            run = 2;
            if (verbose) System.out.println("Finally we start the indexing process, please wait ...");
            readFile(indexWriter, inputFile);
            if (verbose) System.out.println("Indexing finished.");
        }
        indexWriter.commit();
        indexWriter.close();
    } catch (Exception e) {
        e.printStackTrace();
    }
}
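// Hypothetical skeleton, not the original readFile(...): it only illustrates how a helper
// could dispatch on the run counter used above — pass 0 counts the images, pass 1 collects
// the sampled representative features, pass 2 indexes everything. parseFeatures(...) and
// createHashedDocument(...) are assumed helpers, not part of the original class.
private void readFileSketch(IndexWriter writer, File inputFile) throws IOException {
    for (LireFeature feature : parseFeatures(inputFile)) {
        if (run == 0) {
            docCount++; // pass 0: just count the images in the data file
        } else if (run == 1) {
            // pass 1: keep the features whose position was sampled into representativesID
            if (representativesID.contains(docCount)) representatives.add(feature);
            docCount++;
        } else {
            // pass 2: build and add the actual index document
            writer.addDocument(createHashedDocument(feature));
            docCount++;
        }
    }
}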
public void testIndexLarge() throws IOException {
    // ArrayList<String> images = FileUtils.getAllImages(new File("C:\\Temp\\testImagelogos"), true);
    ArrayList<String> images = FileUtils.getAllImages(
            new File("C:\\Java\\Projects\\LireSVN\\testdata\\flickr-10000"), false);
    IndexWriter iw = LuceneUtils.createIndexWriter(
            "index-large", true, LuceneUtils.AnalyzerType.WhitespaceAnalyzer);
    // select one feature for the large index:
    int featureIndex = 13;
    int count = 0;
    long ms = System.currentTimeMillis();
    DocumentBuilder builder = new ChainedDocumentBuilder();
    ((ChainedDocumentBuilder) builder).addBuilder(builders[featureIndex]);
    // ((ChainedDocumentBuilder) builder).addBuilder(builders[0]);
    for (Iterator<String> iterator = images.iterator(); iterator.hasNext(); ) {
        count++;
        if (count > 100 && count % 500 == 0) {
            System.out.println(count + " files indexed. "
                    + (System.currentTimeMillis() - ms) / count + " ms per file");
        }
        String file = iterator.next();
        try {
            // try to trim the image first ...
            // BufferedImage img = ImageUtils.trimWhiteSpace(ImageIO.read(new FileInputStream(file)));
            // iw.addDocument(builder.createDocument(img, file));
            iw.addDocument(builder.createDocument(new FileInputStream(file), file));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    iw.close();
}
private void indexFiles(ArrayList<String> images, DocumentBuilder builder, String indexPath)
        throws IOException {
    // System.out.println(">> Indexing " + images.size() + " files.");
    // DocumentBuilder builder = DocumentBuilderFactory.getExtensiveDocumentBuilder();
    // DocumentBuilder builder = DocumentBuilderFactory.getFastDocumentBuilder();
    IndexWriter iw = LuceneUtils.createIndexWriter(indexPath, true);
    int count = 0;
    long time = System.currentTimeMillis();
    for (String identifier : images) {
        Document doc = builder.createDocument(new FileInputStream(identifier), identifier);
        iw.addDocument(doc);
        count++;
        if (count % 100 == 0) System.out.println(count + " files indexed.");
        // if (count == 200) break;
    }
    long timeTaken = (System.currentTimeMillis() - time);
    float sec = ((float) timeTaken) / 1000f;
    System.out.println(sec + " seconds taken, " + (timeTaken / count) + " ms per image.");
    iw.commit();
    iw.close();
}
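// Hedged usage sketch: how the indexFiles(...) method above might be driven from a test.
// The image directory and index name are assumptions; the builder factory call appears
// elsewhere in this code base.
public void testIndexWithCEDD() throws IOException {
    ArrayList<String> images = FileUtils.getAllImages(new File("testdata/ferrari"), true);
    indexFiles(images, DocumentBuilderFactory.getCEDDDocumentBuilder(), "test-index-cedd");
}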
public void testCreateAndSearchSmallIndex() throws IOException {
    for (int i = 0, buildersLength = builders.length; i < buildersLength; i++) {
        DocumentBuilder b = builders[i];
        // create an index with a specific builder:
        IndexWriter iw = LuceneUtils.createIndexWriter(indexPath + "-small", true);
        for (String identifier : testFiles) {
            Document doc = b.createDocument(new FileInputStream(testFilesPath + identifier), identifier);
            doc.add(new StoredField("video_file", "surgery1.mp4"));
            doc.add(new StoredField("timestamp", "25"));
            iw.addDocument(doc);
        }
        iw.close();

        ImageSearcher s = searchers[i];
        IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexPath + "-small")));
        for (int k = 0; k < reader.maxDoc(); k++) {
            Document query = reader.document(k);
            ImageSearchHits hits = s.search(query, reader);
            for (int y = 0; y < hits.length(); y++) {
                Document result = hits.doc(y);
                if (y == 0) {
                    // check if the first result is the query:
                    assertEquals(result.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]
                            .equals(query.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]), true);
                    System.out.println(result.getValues("video_file")[0]);
                } else {
                    // check if they are ordered by distance:
                    assertEquals(hits.score(y) < hits.score(y - 1), true);
                }
            }
        }
    }
}
/**
 * Creates a set of reference objects and stores it in a new index (name "<indexPath>-ro"). Then
 * creates ordered lists of reference object positions for each data item in the index with the
 * given feature. Finally a new index (name "<indexPath>-ms") is created where all the original
 * documents as well as the new data are stored.
 *
 * @param indexPath the path to the original index
 * @throws IOException
 */
public void createIndex(String indexPath) throws IOException {
    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexPath)));
    int numDocs = reader.numDocs();
    if (numDocs < numReferenceObjects) {
        throw new UnsupportedOperationException("Too few documents in index.");
    }
    // progress report
    progress.setNumDocsAll(numDocs);
    progress.setCurrentState(State.RoSelection);
    boolean hasDeletions = reader.hasDeletions();

    // init reference objects:
    IndexWriter iw = LuceneUtils.createIndexWriter(indexPath + "-ro", true);
    HashSet<Integer> referenceObjsIds = new HashSet<Integer>(numReferenceObjects);
    double numDocsDouble = (double) numDocs;
    while (referenceObjsIds.size() < numReferenceObjects) {
        referenceObjsIds.add((int) (numDocsDouble * Math.random()));
    }
    int count = 0;
    if (hasDeletions) {
        System.err.println("WARNING: There are deleted docs in your index. You should "
                + "optimize your index before using this method.");
    }
    // progress report
    progress.setCurrentState(State.RoIndexing);

    // find them in the index and put them into a separate index:
    for (int i : referenceObjsIds) {
        count++;
        Document document = reader.document(i);
        document.add(new Field("ro-id", count + "", StringField.TYPE_STORED));
        iw.addDocument(document);
    }
    iw.commit();
    iw.close();

    // progress report
    progress.setCurrentState(State.Indexing);

    // now find the reference objects for each entry ;)
    IndexReader readerRo = DirectoryReader.open(FSDirectory.open(new File(indexPath + "-ro")));
    ImageSearcher searcher =
            new GenericImageSearcher(numReferenceObjectsUsed, featureClass, featureFieldName);
    Map<String, Analyzer> analyzerPerField = new HashMap<String, Analyzer>();
    analyzerPerField.put("ro-order", new WhitespaceAnalyzer(LuceneUtils.LUCENE_VERSION));
    PerFieldAnalyzerWrapper aWrapper =
            new PerFieldAnalyzerWrapper(new SimpleAnalyzer(LuceneUtils.LUCENE_VERSION), analyzerPerField);
    iw = new IndexWriter(FSDirectory.open(new File(indexPath)),
            new IndexWriterConfig(LuceneUtils.LUCENE_VERSION, aWrapper)
                    .setOpenMode(IndexWriterConfig.OpenMode.CREATE));
    StringBuilder sb = new StringBuilder(256);
    // needed to check whether a document has been deleted
    Bits liveDocs = MultiFields.getLiveDocs(reader);
    for (int i = 0; i < numDocs; i++) {
        if (reader.hasDeletions() && !liveDocs.get(i)) continue; // if it is deleted, just ignore it.
        Document document = reader.document(i);
        ImageSearchHits hits = searcher.search(document, readerRo);
        sb.delete(0, sb.length());
        for (int j = 0; j < numReferenceObjectsUsed; j++) {
            sb.append(hits.doc(j).getValues("ro-id")[0]);
            sb.append(' ');
        }
        // System.out.println(sb.toString());
        document.add(new TextField("ro-order", sb.toString(), Field.Store.YES));
        iw.updateDocument(
                new Term(DocumentBuilder.FIELD_NAME_IDENTIFIER,
                        document.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]),
                document);
        // progress report
        progress.setNumDocsProcessed(progress.getNumDocsProcessed() + 1);
    }
    iw.commit();
    iw.close();

    // progress report
    progress.setCurrentState(State.Idle);
}
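// Illustrative sketch, not part of the original class: how the "ro-order" field written by
// createIndex(...) could be used at search time. The query image's nearest reference objects
// are looked up in the "-ro" index exactly as during indexing, and their ids are issued as a
// text query against the main index, so documents sharing many reference objects with the
// query score high (simple term overlap; the stored ordering is not exploited beyond that).
// The method name and the result count of 50 are assumptions.
public TopDocs searchRoOrder(Document queryDocument, String indexPath) throws IOException, ParseException {
    IndexReader readerRo = DirectoryReader.open(FSDirectory.open(new File(indexPath + "-ro")));
    ImageSearcher roSearcher =
            new GenericImageSearcher(numReferenceObjectsUsed, featureClass, featureFieldName);
    ImageSearchHits hits = roSearcher.search(queryDocument, readerRo);
    StringBuilder sb = new StringBuilder(256);
    for (int j = 0; j < numReferenceObjectsUsed; j++) {
        sb.append(hits.doc(j).getValues("ro-id")[0]);
        sb.append(' ');
    }
    // query the main index on "ro-order" with the same analyzer that was used for indexing
    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexPath)));
    QueryParser parser = new QueryParser(LuceneUtils.LUCENE_VERSION, "ro-order",
            new WhitespaceAnalyzer(LuceneUtils.LUCENE_VERSION));
    Query query = parser.parse(sb.toString().trim());
    return new IndexSearcher(reader).search(query, 50);
}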