/**
 * Override this method if you want to filter the input, apply hashing, etc.
 *
 * @param feature          the current feature.
 * @param document         the current document.
 * @param featureFieldName the field name of the feature.
 */
protected void addToDocument(LireFeature feature, Document document, String featureFieldName) {
    if (run == 0) {
        // just count documents
    } else if (run == 1) {
        // select the representatives ...
        if (representativesID.contains(docCount)
                && feature.getClass().getCanonicalName().equals(featureClass.getCanonicalName())) {
            // it's a representative: put it into a temporary data structure ...
            representatives.add(feature);
        }
    } else if (run == 2) {
        // actual hashing: find the nearest representatives and put those as a hash into the document.
        if (feature.getClass().getCanonicalName().equals(featureClass.getCanonicalName())) {
            // it's a feature to be hashed
            int[] hashes = getHashes(feature);
            document.add(new TextField(featureFieldName + "_hash",
                    createDocumentString(hashes, hashes.length), Field.Store.YES));
            document.add(new TextField(featureFieldName + "_hash_q",
                    createDocumentString(hashes, 10), Field.Store.YES));
        }
        document.add(new StoredField(featureFieldName, feature.getByteArrayRepresentation()));
    }
}
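The createDocumentString helper called above is not shown. A minimal sketch of what it might look like, assuming it simply joins the first length hash values into a whitespace-separated string so the WhitespaceAnalyzer can turn each representative ID into its own term; this is a hypothetical reconstruction, not necessarily the actual implementation:

    // Hypothetical sketch: concatenates hash values (nearest-representative IDs)
    // into a single whitespace-separated string for indexing as a TextField.
    protected String createDocumentString(int[] hashes, int length) {
        StringBuilder sb = new StringBuilder(length * 8);
        for (int i = 0; i < Math.min(length, hashes.length); i++) {
            sb.append(hashes[i]).append(' '); // each ID becomes one token
        }
        return sb.toString().trim();
    }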
public void run() {
    // do it ...
    try {
        IndexWriter indexWriter = LuceneUtils.createIndexWriter(indexPath, overwriteIndex,
                LuceneUtils.AnalyzerType.WhitespaceAnalyzer);
        for (Iterator<File> iterator = inputFiles.iterator(); iterator.hasNext(); ) {
            File inputFile = iterator.next();
            if (verbose) System.out.println("Processing " + inputFile.getPath() + ".");
            if (verbose) System.out.println("Counting images.");
            run = 0;
            readFile(indexWriter, inputFile);
            if (verbose) System.out.printf("%d images found in the data file.\n", docCount);
            int numberOfRepresentatives = 1000; // TODO: clever selection.
            // select a number of representative "fixed stars" randomly from the file
            if (numberOfRepresentatives > Math.sqrt(docCount))
                numberOfRepresentatives = (int) Math.sqrt(docCount);
            if (verbose)
                System.out.printf("Selecting %d representative images for hashing.\n",
                        numberOfRepresentatives);
            representativesID = new HashSet<Integer>(numberOfRepresentatives);
            while (representativesID.size() < numberOfRepresentatives) {
                representativesID.add((int) Math.floor(Math.random() * (docCount - 1)));
            }
            representatives = new ArrayList<LireFeature>(numberOfRepresentatives);
            docCount = 0;
            run = 1;
            if (verbose) System.out.println("Now getting representatives from the data file.");
            readFile(indexWriter, inputFile);
            docCount = 0;
            run = 2;
            if (verbose) System.out.println("Finally we start the indexing process, please wait ...");
            readFile(indexWriter, inputFile);
            if (verbose) System.out.println("Indexing finished.");
        }
        indexWriter.commit();
        indexWriter.close();
    } catch (Exception e) {
        e.printStackTrace();
    }
}
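The getHashes method used in run 2 is also not shown. Under the metric-spaces scheme this run() method sets up, it would rank the selected representatives by their distance to the incoming feature and return the IDs of the nearest ones. A minimal sketch, assuming LireFeature exposes a getDistance(LireFeature) method as in LIRE; the hash length of 25 is an assumed tuning parameter:

    // Hypothetical sketch: returns the indices of the representatives
    // closest to the given feature, ordered by ascending distance.
    protected int[] getHashes(LireFeature feature) {
        int numHashes = 25; // assumed hash length; a tuning parameter in practice
        Integer[] order = new Integer[representatives.size()];
        final float[] distances = new float[representatives.size()];
        for (int i = 0; i < representatives.size(); i++) {
            order[i] = i;
            distances[i] = feature.getDistance(representatives.get(i));
        }
        // sort representative indices by distance to the query feature
        java.util.Arrays.sort(order, new java.util.Comparator<Integer>() {
            public int compare(Integer a, Integer b) {
                return Float.compare(distances[a], distances[b]);
            }
        });
        int[] hashes = new int[Math.min(numHashes, order.length)];
        for (int i = 0; i < hashes.length; i++) hashes[i] = order[i];
        return hashes;
    }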
protected void validateResponse(TermVectorResponse esResponse, Fields luceneFields, TestConfig testConfig)
        throws IOException {
    TestDoc testDoc = testConfig.doc;
    HashSet<String> selectedFields = testConfig.selectedFields == null ? null
            : new HashSet<String>(Arrays.asList(testConfig.selectedFields));
    Fields esTermVectorFields = esResponse.getFields();
    for (TestFieldSetting field : testDoc.fieldSettings) {
        Terms esTerms = esTermVectorFields.terms(field.name);
        if (selectedFields != null && !selectedFields.contains(field.name)) {
            assertNull(esTerms);
            continue;
        }
        assertNotNull(esTerms);
        Terms luceneTerms = luceneFields.terms(field.name);
        TermsEnum esTermEnum = esTerms.iterator(null);
        TermsEnum luceneTermEnum = luceneTerms.iterator(null);
        while (esTermEnum.next() != null) {
            assertNotNull(luceneTermEnum.next());
            assertThat(esTermEnum.totalTermFreq(), equalTo(luceneTermEnum.totalTermFreq()));
            DocsAndPositionsEnum esDocsPosEnum = esTermEnum.docsAndPositions(null, null, 0);
            DocsAndPositionsEnum luceneDocsPosEnum = luceneTermEnum.docsAndPositions(null, null, 0);
            if (luceneDocsPosEnum == null) {
                // expected only when nothing position-related was stored
                assertFalse(field.storedOffset);
                assertFalse(field.storedPayloads);
                assertFalse(field.storedPositions);
                continue;
            }
            String currentTerm = esTermEnum.term().utf8ToString();
            assertThat("Token mismatch for field: " + field.name, currentTerm,
                    equalTo(luceneTermEnum.term().utf8ToString()));
            esDocsPosEnum.nextDoc();
            luceneDocsPosEnum.nextDoc();
            int freq = esDocsPosEnum.freq();
            assertThat(freq, equalTo(luceneDocsPosEnum.freq()));
            for (int i = 0; i < freq; i++) {
                String failDesc = " (field:" + field.name + " term:" + currentTerm + ")";
                int lucenePos = luceneDocsPosEnum.nextPosition();
                int esPos = esDocsPosEnum.nextPosition();
                if (field.storedPositions && testConfig.requestPositions) {
                    assertThat("Position test failed" + failDesc, lucenePos, equalTo(esPos));
                } else {
                    assertThat("Missing position test failed" + failDesc, esPos, equalTo(-1));
                }
                if (field.storedOffset && testConfig.requestOffsets) {
                    assertThat("Offset test failed" + failDesc, luceneDocsPosEnum.startOffset(),
                            equalTo(esDocsPosEnum.startOffset()));
                    assertThat("Offset test failed" + failDesc, luceneDocsPosEnum.endOffset(),
                            equalTo(esDocsPosEnum.endOffset()));
                } else {
                    assertThat("Missing offset test failed" + failDesc,
                            esDocsPosEnum.startOffset(), equalTo(-1));
                    assertThat("Missing offset test failed" + failDesc,
                            esDocsPosEnum.endOffset(), equalTo(-1));
                }
                if (field.storedPayloads && testConfig.requestPayloads) {
                    assertThat("Payload test failed" + failDesc, luceneDocsPosEnum.getPayload(),
                            equalTo(esDocsPosEnum.getPayload()));
                } else {
                    assertThat("Missing payload test failed" + failDesc,
                            esDocsPosEnum.getPayload(), equalTo(null));
                }
            }
        }
        assertNull("ES terms are exhausted but Lucene's are not", luceneTermEnum.next());
    }
}
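For context, the luceneFields argument compared against the Elasticsearch response would typically come from a term-vector lookup on a locally built Lucene index. A minimal usage sketch, assuming Lucene 4.x (matching the iterator(null) and DocsAndPositionsEnum API used above) and a hypothetical directory and document ID:

    // Hypothetical sketch: fetch per-document term vectors from a local index
    // and hand them to validateResponse(); the reader must stay open while
    // the returned Fields are being iterated.
    DirectoryReader reader = DirectoryReader.open(dir);
    try {
        Fields luceneFields = reader.getTermVectors(docId); // null if no term vectors were stored
        validateResponse(esResponse, luceneFields, testConfig);
    } finally {
        reader.close();
    }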