/** Return a unique char buffer representing a given character sequence. */
public char[] intern(MutableCharArray chs) {
  if (tokenCache.contains(chs)) {
    return tokenCache.lget().getBuffer();
  } else {
    final char[] tokenImage = new char[chs.length()];
    System.arraycopy(chs.getBuffer(), chs.getStart(), tokenImage, 0, chs.length());
    tokenCache.add(new MutableCharArray(tokenImage));
    return tokenImage;
  }
}
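// Minimal usage sketch (not part of the original source; assumes a PreprocessingContext
// instance named "context"): interning two equal character sequences returns the same backing
// array, so later code can compare token images by reference (==) instead of by content.
MutableCharArray first = new MutableCharArray("data".toCharArray());
MutableCharArray second = new MutableCharArray("data".toCharArray());
char[] a = context.intern(first);
char[] b = context.intern(second);
assert a == b; // equal contents intern to the same buffer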
@Test
public void testBytes() throws Exception {
  List<Integer> docs = Arrays.asList(1, 5, 7);

  ObjectOpenHashSet<BytesRef> hTerms = new ObjectOpenHashSet<BytesRef>();
  List<BytesRef> cTerms = new ArrayList<BytesRef>(docs.size());
  for (int i = 0; i < docs.size(); i++) {
    BytesRef term = new BytesRef("str" + docs.get(i));
    hTerms.add(term);
    cTerms.add(term);
  }

  FieldDataTermsFilter hFilter = FieldDataTermsFilter.newBytes(getFieldData(strMapper), hTerms);

  int size = reader.maxDoc();
  FixedBitSet result = new FixedBitSet(size);

  result.clear(0, size);
  assertThat(result.cardinality(), equalTo(0));
  result.or(hFilter.getDocIdSet(reader.getContext(), reader.getLiveDocs()).iterator());
  assertThat(result.cardinality(), equalTo(docs.size()));
  for (int i = 0; i < reader.maxDoc(); i++) {
    assertThat(result.get(i), equalTo(docs.contains(i)));
  }

  // filter from mapper
  result.clear(0, size);
  assertThat(result.cardinality(), equalTo(0));
  result.or(
      strMapper
          .termsFilter(ifdService, cTerms, null)
          .getDocIdSet(reader.getContext(), reader.getLiveDocs())
          .iterator());
  assertThat(result.cardinality(), equalTo(docs.size()));
  for (int i = 0; i < reader.maxDoc(); i++) {
    assertThat(result.get(i), equalTo(docs.contains(i)));
  }

  result.clear(0, size);
  assertThat(result.cardinality(), equalTo(0));

  // filter on a numeric field using BytesRef terms
  // should not match any docs
  hFilter = FieldDataTermsFilter.newBytes(getFieldData(lngMapper), hTerms);
  result.or(hFilter.getDocIdSet(reader.getContext(), reader.getLiveDocs()).iterator());
  assertThat(result.cardinality(), equalTo(0));

  // filter on a numeric field using BytesRef terms
  // should not match any docs
  hFilter = FieldDataTermsFilter.newBytes(getFieldData(dblMapper), hTerms);
  result.or(hFilter.getDocIdSet(reader.getContext(), reader.getLiveDocs()).iterator());
  assertThat(result.cardinality(), equalTo(0));
}
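// Conceptual sketch (not part of the test above): FixedBitSet.or(DocIdSetIterator) sets a bit
// for every document id the iterator produces, so cardinality() afterwards equals the number of
// matching documents. Reusing the test's "hFilter", "reader" and "result" variables for
// illustration, it behaves roughly like this manual loop:
DocIdSetIterator it = hFilter.getDocIdSet(reader.getContext(), reader.getLiveDocs()).iterator();
for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
  result.set(doc);
}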
private Map<DiscoveryNode, TransportNodesListShardStoreMetaData.StoreFilesMetaData> buildShardStores(
    DiscoveryNodes nodes, MutableShardRouting shard) {
  Map<DiscoveryNode, TransportNodesListShardStoreMetaData.StoreFilesMetaData> shardStores =
      cachedStores.get(shard.shardId());
  ObjectOpenHashSet<String> nodesIds;
  if (shardStores == null) {
    shardStores = Maps.newHashMap();
    cachedStores.put(shard.shardId(), shardStores);
    nodesIds = ObjectOpenHashSet.from(nodes.dataNodes().keys());
  } else {
    nodesIds = ObjectOpenHashSet.newInstance();
    // clean nodes that have failed
    for (Iterator<DiscoveryNode> it = shardStores.keySet().iterator(); it.hasNext(); ) {
      DiscoveryNode node = it.next();
      if (!nodes.nodeExists(node.id())) {
        it.remove();
      }
    }
    for (ObjectCursor<DiscoveryNode> cursor : nodes.dataNodes().values()) {
      DiscoveryNode node = cursor.value;
      if (!shardStores.containsKey(node)) {
        nodesIds.add(node.id());
      }
    }
  }

  if (!nodesIds.isEmpty()) {
    String[] nodesIdsArray = nodesIds.toArray(String.class);
    TransportNodesListShardStoreMetaData.NodesStoreFilesMetaData nodesStoreFilesMetaData =
        listShardStoreMetaData
            .list(shard.shardId(), false, nodesIdsArray, listTimeout)
            .actionGet();
    if (logger.isTraceEnabled()) {
      if (nodesStoreFilesMetaData.failures().length > 0) {
        StringBuilder sb =
            new StringBuilder(shard + ": failures when trying to list stores on nodes:");
        for (int i = 0; i < nodesStoreFilesMetaData.failures().length; i++) {
          Throwable cause = ExceptionsHelper.unwrapCause(nodesStoreFilesMetaData.failures()[i]);
          if (cause instanceof ConnectTransportException) {
            continue;
          }
          sb.append("\n -> ")
              .append(nodesStoreFilesMetaData.failures()[i].getDetailedMessage());
        }
        logger.trace(sb.toString());
      }
    }

    for (TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData nodeStoreFilesMetaData :
        nodesStoreFilesMetaData) {
      if (nodeStoreFilesMetaData.storeFilesMetaData() != null) {
        shardStores.put(
            nodeStoreFilesMetaData.getNode(), nodeStoreFilesMetaData.storeFilesMetaData());
      }
    }
  }

  return shardStores;
}
public static void main(String[] args) throws Exception {
  Natives.tryMlockall();
  Settings settings =
      settingsBuilder()
          .put("refresh_interval", "-1")
          .put(SETTING_NUMBER_OF_SHARDS, 1)
          .put(SETTING_NUMBER_OF_REPLICAS, 0)
          .build();

  String clusterName = TermsAggregationSearchAndIndexingBenchmark.class.getSimpleName();
  nodes = new InternalNode[1];
  for (int i = 0; i < nodes.length; i++) {
    nodes[i] =
        (InternalNode)
            nodeBuilder()
                .settings(settingsBuilder().put(settings).put("name", "node1"))
                .clusterName(clusterName)
                .node();
  }
  Client client = nodes[0].client();

  client
      .admin()
      .cluster()
      .prepareHealth(indexName)
      .setWaitForGreenStatus()
      .setTimeout("10s")
      .execute()
      .actionGet();
  try {
    client
        .admin()
        .indices()
        .prepareCreate(indexName)
        .addMapping(typeName, generateMapping("eager", "lazy"))
        .get();
    Thread.sleep(5000);

    long startTime = System.currentTimeMillis();
    ObjectOpenHashSet<String> uniqueTerms = ObjectOpenHashSet.newInstance();
    for (int i = 0; i < NUMBER_OF_TERMS; i++) {
      boolean added;
      do {
        added = uniqueTerms.add(RandomStrings.randomAsciiOfLength(random, STRING_TERM_SIZE));
      } while (!added);
    }
    String[] sValues = uniqueTerms.toArray(String.class);

    long ITERS = COUNT / BATCH;
    long i = 1;
    int counter = 0;
    for (; i <= ITERS; i++) {
      BulkRequestBuilder request = client.prepareBulk();
      for (int j = 0; j < BATCH; j++) {
        counter++;

        XContentBuilder builder = jsonBuilder().startObject();
        builder.field("id", Integer.toString(counter));
        final String sValue = sValues[counter % sValues.length];
        builder.field("s_value", sValue);
        builder.field("s_value_dv", sValue);

        for (String field : new String[] {"sm_value", "sm_value_dv"}) {
          builder.startArray(field);
          for (int k = 0; k < NUMBER_OF_MULTI_VALUE_TERMS; k++) {
            builder.value(sValues[ThreadLocalRandom.current().nextInt(sValues.length)]);
          }
          builder.endArray();
        }

        request.add(
            Requests.indexRequest(indexName)
                .type("type1")
                .id(Integer.toString(counter))
                .source(builder));
      }
      BulkResponse response = request.execute().actionGet();
      if (response.hasFailures()) {
        System.err.println("--> failures...");
      }
      if (((i * BATCH) % 10000) == 0) {
        System.out.println("--> Indexed " + (i * BATCH));
      }
    }
    System.out.println(
        "--> Indexing took " + ((System.currentTimeMillis() - startTime) / 1000) + " seconds.");
  } catch (IndexAlreadyExistsException e) {
    System.out.println("--> Index already exists, ignoring indexing phase, waiting for green");
    ClusterHealthResponse clusterHealthResponse =
        client
            .admin()
            .cluster()
            .prepareHealth(indexName)
            .setWaitForGreenStatus()
            .setTimeout("10m")
            .execute()
            .actionGet();
    if (clusterHealthResponse.isTimedOut()) {
      System.err.println("--> Timed out waiting for cluster health");
    }
  }
  client
      .admin()
      .indices()
      .preparePutMapping(indexName)
      .setType(typeName)
      .setSource(generateMapping("lazy", "lazy"))
      .get();
  client.admin().indices().prepareRefresh().execute().actionGet();
  System.out.println(
      "--> Number of docs in index: "
          + client.prepareCount().setQuery(matchAllQuery()).execute().actionGet().getCount());

  String[] nodeIds = new String[nodes.length];
  for (int i = 0; i < nodeIds.length; i++) {
    nodeIds[i] = nodes[i].injector().getInstance(Discovery.class).localNode().getId();
  }

  List<TestRun> testRuns = new ArrayList<>();
  testRuns.add(new TestRun("Regular field ordinals", "eager", "lazy", "s_value", "ordinals"));
  testRuns.add(
      new TestRun("Docvalues field ordinals", "lazy", "eager", "s_value_dv", "ordinals"));
  testRuns.add(
      new TestRun(
          "Regular field global ordinals", "eager_global_ordinals", "lazy", "s_value", null));
  testRuns.add(
      new TestRun("Docvalues field global", "lazy", "eager_global_ordinals", "s_value_dv", null));

  List<TestResult> testResults = new ArrayList<>();
  for (TestRun testRun : testRuns) {
    client
        .admin()
        .indices()
        .preparePutMapping(indexName)
        .setType(typeName)
        .setSource(
            generateMapping(testRun.indexedFieldEagerLoading, testRun.docValuesEagerLoading))
        .get();
    client.admin().indices().prepareClearCache(indexName).setFieldDataCache(true).get();
    SearchThread searchThread =
        new SearchThread(client, testRun.termsAggsField, testRun.termsAggsExecutionHint);
    RefreshThread refreshThread = new RefreshThread(client);
    System.out.println("--> Running '" + testRun.name + "' round...");
    new Thread(refreshThread).start();
    new Thread(searchThread).start();
    Thread.sleep(2 * 60 * 1000);
    refreshThread.stop();
    searchThread.stop();

    System.out.println("--> Avg refresh time: " + refreshThread.avgRefreshTime + " ms");
    System.out.println("--> Avg query time: " + searchThread.avgQueryTime + " ms");

    ClusterStatsResponse clusterStateResponse =
        client.admin().cluster().prepareClusterStats().setNodesIds(nodeIds).get();
    System.out.println(
        "--> Heap used: " + clusterStateResponse.getNodesStats().getJvm().getHeapUsed());
    ByteSizeValue fieldDataMemoryUsed =
        clusterStateResponse.getIndicesStats().getFieldData().getMemorySize();
    System.out.println("--> Fielddata memory size: " + fieldDataMemoryUsed);

    testResults.add(
        new TestResult(
            testRun.name,
            refreshThread.avgRefreshTime,
            searchThread.avgQueryTime,
            fieldDataMemoryUsed));
  }

  System.out.println(
      "----------------------------------------- SUMMARY ----------------------------------------------");
  System.out.format(
      Locale.ENGLISH,
      "%30s%18s%15s%15s\n",
      "name",
      "avg refresh time",
      "avg query time",
      "fielddata size");
  for (TestResult testResult : testResults) {
    System.out.format(
        Locale.ENGLISH,
        "%30s%18s%15s%15s\n",
        testResult.name,
        testResult.avgRefreshTime,
        testResult.avgQueryTime,
        testResult.fieldDataSizeInMemory);
  }
  System.out.println(
      "----------------------------------------- SUMMARY ----------------------------------------------");

  client.close();
  for (InternalNode node : nodes) {
    node.close();
  }
}
/** * Document preprocessing context provides low-level (usually integer-coded) data structures useful * for further processing. * * <p><img src="doc-files/preprocessing-arrays.png" alt="Internals of PreprocessingContext"/> */ public final class PreprocessingContext { /** Uninitialized structure constant. */ private static final String UNINITIALIZED = "[uninitialized]"; /** Query used to perform processing, may be <code>null</code> */ public final String query; /** A list of documents to process. */ public final List<Document> documents; /** Language model to be used */ public final LanguageModel language; /** * Token interning cache. Token images are interned to save memory and allow reference * comparisons. */ private ObjectOpenHashSet<MutableCharArray> tokenCache = ObjectOpenHashSet.newInstance(); /** * Creates a preprocessing context for the provided <code>documents</code> and with the provided * <code>languageModel</code>. */ public PreprocessingContext(LanguageModel languageModel, List<Document> documents, String query) { this.query = query; this.documents = documents; this.language = languageModel; } /** * Information about all tokens of the input {@link PreprocessingContext#documents}. Each element * of each of the arrays corresponds to one individual token from the input or a synthetic * separator inserted between documents, fields and sentences. Last element of this array is a * special terminator entry. * * <p>All arrays in this class have the same length and values across different arrays correspond * to each other for the same index. */ public class AllTokens { /** * Token image as it appears in the input. On positions where {@link #type} is equal to one of * {@link ITokenizer#TF_TERMINATOR}, {@link ITokenizer#TF_SEPARATOR_DOCUMENT} or {@link * ITokenizer#TF_SEPARATOR_FIELD} , image is <code>null</code>. * * <p>This array is produced by {@link Tokenizer}. */ public char[][] image; /** * Token's {@link ITokenizer} bit flags. * * <p>This array is produced by {@link Tokenizer}. */ public short[] type; /** * Document field the token came from. The index points to arrays in {@link AllFields}, equal to * <code>-1</code> for document and field separators. * * <p>This array is produced by {@link Tokenizer}. */ public byte[] fieldIndex; /** * Index of the document this token came from, points to elements of {@link * PreprocessingContext#documents}. Equal to <code>-1</code> for document separators. * * <p>This array is produced by {@link Tokenizer}. * * <p>This array is accessed in in {@link CaseNormalizer} and {@link PhraseExtractor} to compute * by-document statistics, e.g. tf-by document, which are then needed to build a VSM or assign * documents to labels. An alternative to this representation would be creating an <code> * AllDocuments</code> holder and keep there an array of start token indexes for each document * and then refactor the model building code to do a binary search to determine the document * index given token index. This is likely to be a significant performance hit because model * building code accesses the documentIndex array pretty much randomly (in the suffix order), so * we'd be doing twice-the-number-of-tokens binary searches. Unless there's some other data * structure that can help us here. */ public int[] documentIndex; /** * A pointer to {@link AllWords} arrays for this token. Equal to <code>-1</code> for document, * field and {@link ITokenizer#TT_PUNCTUATION} tokens (including sentence separators). * * <p>This array is produced by {@link CaseNormalizer}. 
*/ public int[] wordIndex; /** * The suffix order of tokens. Suffixes starting with a separator come at the end of the array. * * <p>This array is produced by {@link PhraseExtractor}. */ public int[] suffixOrder; /** * The Longest Common Prefix for the adjacent suffix-sorted token sequences. * * <p>This array is produced by {@link PhraseExtractor}. */ public int[] lcp; /** For debugging purposes. */ @Override public String toString() { StringWriter sw = new StringWriter(); TabularOutput t = new TabularOutput(sw); t.flushEvery(Integer.MAX_VALUE); t.addColumn("#"); t.addColumn("token").alignLeft(); t.addColumn("type"); t.addColumn("fieldIndex"); t.addColumn("=>field").alignLeft(); t.addColumn("docIdx"); t.addColumn("wordIdx"); t.addColumn("=>word").alignLeft(); for (int i = 0; i < image.length; i++, t.nextRow()) { t.rowData( i, image[i] == null ? "<null>" : new String(image[i]), type[i], fieldIndex[i], fieldIndex[i] >= 0 ? allFields.name[fieldIndex[i]] : null, documentIndex[i], wordIndex[i], wordIndex[i] >= 0 ? new String(allWords.image[wordIndex[i]]) : null); } if (suffixOrder != null) { t = new TabularOutput(sw); t.addColumn("#"); t.addColumn("sa"); t.addColumn("lcp"); t.addColumn("=>words").alignLeft(); sw.append("\n"); final StringBuilder suffixImage = new StringBuilder(); for (int i = 0; i < suffixOrder.length; i++, t.nextRow()) { t.rowData(i, suffixOrder[i], lcp[i]); int windowLength = 5; for (int j = suffixOrder[i], max = Math.min(suffixOrder[i] + windowLength, wordIndex.length); j < max; ) { suffixImage .append(wordIndex[j] >= 0 ? new String(allWords.image[wordIndex[j]]) : "|") .append(" "); if (++j == max && j != wordIndex.length) suffixImage.append(" [...]"); } t.rowData(suffixImage.toString()); suffixImage.setLength(0); } sw.append("\n"); } t.flush(); sw.append("\n"); return sw.toString(); } } /** Information about all tokens of the input {@link PreprocessingContext#documents}. */ public final AllTokens allTokens = new AllTokens(); /** * Information about all fields processed for the input {@link PreprocessingContext#documents}. */ public static class AllFields { /** * Name of the document field. Entries of {@link AllTokens#fieldIndex} point to this array. * * <p>This array is produced by {@link Tokenizer}. */ public String[] name; /** For debugging purposes. */ @Override public String toString() { StringWriter sw = new StringWriter(); TabularOutput t = new TabularOutput(sw); t.flushEvery(Integer.MAX_VALUE); t.addColumn("#"); t.addColumn("name").format("%-10s").alignLeft(); int i = 0; for (String n : name) { t.rowData(i++, n).nextRow(); } t.flush(); sw.append("\n"); return sw.toString(); } } /** * Information about all fields processed for the input {@link PreprocessingContext#documents}. */ public final AllFields allFields = new AllFields(); /** * Information about all unique words found in the input {@link PreprocessingContext#documents}. * An entry in each parallel array corresponds to one conflated form of a word. For example, * <em>data</em> and <em>DATA</em> will most likely become a single entry in the words table. * However, different grammatical forms of a single lemma (like <em>computer</em> and * <em>computers</em>) will have different entries in the words table. See {@link AllStems} for * inflection-conflated versions. * * <p>All arrays in this class have the same length and values across different arrays correspond * to each other for the same index. */ public class AllWords { /** * The most frequently appearing variant of the word with respect to case. E.g. 
if a token * <em>MacOS</em> appeared 12 times in the input and <em>macos</em> appeared 3 times, the image * will be equal to <em>MacOS</em>. * * <p>This array is produced by {@link CaseNormalizer}. */ public char[][] image; /** * Token type of this word copied from {@link AllTokens#type}. Additional flags are set for each * word by {@link CaseNormalizer} and {@link LanguageModelStemmer}. * * <p>This array is produced by {@link CaseNormalizer}. This array is modified by {@link * LanguageModelStemmer}. * * @see ITokenizer */ public short[] type; /** * Term Frequency of the word, aggregated across all variants with respect to case. Frequencies * for each variant separately are not available. * * <p>This array is produced by {@link CaseNormalizer}. */ public int[] tf; /** * Term Frequency of the word for each document. The length of this array is equal to the number * of documents this word appeared in (Document Frequency) multiplied by 2. Elements at even * indices contain document indices pointing to {@link PreprocessingContext#documents}, elements * at odd indices contain the frequency of the word in the document. For example, an array with * 4 values: <code>[2, 15, 138, 7]</code> means that the word appeared 15 times in document at * index 2 and 7 times in document at index 138. * * <p>This array is produced by {@link CaseNormalizer}. The order of documents in this array is * not defined. */ public int[][] tfByDocument; /** * A pointer to the {@link AllStems} arrays for this word. * * <p>This array is produced by {@link LanguageModelStemmer}. */ public int[] stemIndex; /** * A bit-packed indices of all fields in which this word appears at least once. Indexes * (positions) of selected bits are pointers to the {@link AllFields} arrays. Fast conversion * between the bit-packed representation and <code>byte[]</code> with index values is done by * {@link #toFieldIndexes(byte)} * * <p>This array is produced by {@link CaseNormalizer}. */ public byte[] fieldIndices; /** For debugging purposes. */ @Override public String toString() { StringWriter sw = new StringWriter(); TabularOutput t = new TabularOutput(sw); t.flushEvery(Integer.MAX_VALUE); t.addColumn("#"); t.addColumn("image").alignLeft(); t.addColumn("type"); t.addColumn("tf"); t.addColumn("tfByDocument").alignLeft(); t.addColumn("fieldIndices"); if (stemIndex != null) { t.addColumn("stemIndex"); t.addColumn("=>stem").alignLeft(); } for (int i = 0; i < image.length; i++, t.nextRow()) { t.rowData( i, image[i] == null ? "<null>" : new String(image[i]), type[i], tf[i], SparseArray.sparseToString(tfByDocument[i])); t.rowData(Arrays.toString(toFieldIndexes(fieldIndices[i])).replace(" ", "")); if (stemIndex != null) { t.rowData(stemIndex[i]); t.rowData(new String(allStems.image[stemIndex[i]])); } } t.flush(); sw.append("\n"); return sw.toString(); } } /** * Information about all unique words found in the input {@link PreprocessingContext#documents}. */ public final AllWords allWords = new AllWords(); /** * Information about all unique stems found in the input {@link PreprocessingContext#documents}. * Each entry in each array corresponds to one base form different words can be transformed to by * the {@link IStemmer} used while processing. E.g. the English <em>mining</em> and <em>mine</em> * will be aggregated to one entry in the arrays, while they will have separate entries in {@link * AllWords}. * * <p>All arrays in this class have the same length and values across different arrays correspond * to each other for the same index. 
*/ public class AllStems { /** * Stem image as produced by the {@link IStemmer}, may not correspond to any correct word. * * <p>This array is produced by {@link LanguageModelStemmer}. */ public char[][] image; /** * Pointer to the {@link AllWords} arrays, to the most frequent original form of the stem. * Pointers to the less frequent variants are not available. * * <p>This array is produced by {@link LanguageModelStemmer}. */ public int[] mostFrequentOriginalWordIndex; /** * Term frequency of the stem, i.e. the sum of all {@link AllWords#tf} values for which the * {@link AllWords#stemIndex} points to this stem. * * <p>This array is produced by {@link LanguageModelStemmer}. */ public int[] tf; /** * Term frequency of the stem for each document. For the encoding of this array, see {@link * AllWords#tfByDocument}. * * <p>This array is produced by {@link LanguageModelStemmer}. The order of documents in this * array is not defined. */ public int[][] tfByDocument; /** * A bit-packed indices of all fields in which this word appears at least once. Indexes * (positions) of selected bits are pointers to the {@link AllFields} arrays. Fast conversion * between the bit-packed representation and <code>byte[]</code> with index values is done by * {@link #toFieldIndexes(byte)} * * <p>This array is produced by {@link LanguageModelStemmer} */ public byte[] fieldIndices; /** For debugging purposes. */ @Override public String toString() { StringWriter sw = new StringWriter(); TabularOutput t = new TabularOutput(sw); t.flushEvery(Integer.MAX_VALUE); t.addColumn("#"); t.addColumn("stem"); t.addColumn("mostFrqWord"); t.addColumn("=>mostFrqWord").alignLeft(); t.addColumn("tf"); t.addColumn("tfByDocument").alignLeft(); t.addColumn("fieldIndices"); for (int i = 0; i < image.length; i++, t.nextRow()) { t.rowData( i, image[i] == null ? "<null>" : new String(image[i]), mostFrequentOriginalWordIndex[i], new String(allWords.image[mostFrequentOriginalWordIndex[i]]), tf[i], SparseArray.sparseToString(tfByDocument[i]), Arrays.toString(toFieldIndexes(fieldIndices[i])).replace(" ", "")); } t.flush(); sw.append("\n"); return sw.toString(); } } /** * Information about all unique stems found in the input {@link PreprocessingContext#documents}. */ public final AllStems allStems = new AllStems(); /** * Information about all frequently appearing sequences of words found in the input {@link * PreprocessingContext#documents}. Each entry in each array corresponds to one sequence. * * <p>All arrays in this class have the same length and values across different arrays correspond * to each other for the same index. */ public class AllPhrases { /** * Pointers to {@link AllWords} for each word in the phrase sequence. * * <p>This array is produced by {@link PhraseExtractor}. */ public int[][] wordIndices; /** * Term frequency of the phrase. * * <p>This array is produced by {@link PhraseExtractor}. */ public int[] tf; /** * Term frequency of the phrase for each document. The encoding of this array is similar to * {@link AllWords#tfByDocument}: consecutive pairs of: document index, frequency. * * <p>This array is produced by {@link PhraseExtractor}. The order of documents in this array is * not defined. */ public int[][] tfByDocument; /** For debugging purposes. 
*/ @Override public String toString() { if (wordIndices == null) { return ""; } StringWriter sw = new StringWriter(); TabularOutput t = new TabularOutput(sw); t.flushEvery(Integer.MAX_VALUE); t.addColumn("#"); t.addColumn("wordIndices"); t.addColumn("=>words").alignLeft(); t.addColumn("tf"); t.addColumn("tfByDocument").alignLeft(); for (int i = 0; i < wordIndices.length; i++, t.nextRow()) { t.rowData( i, Arrays.toString(wordIndices[i]).replace(" ", ""), getPhrase(i), tf[i], SparseArray.sparseToString(tfByDocument[i])); } t.flush(); sw.append("\n"); return sw.toString(); } /** Returns space-separated words that constitute this phrase. */ public CharSequence getPhrase(int index) { StringBuilder sb = new StringBuilder(); for (int i = 0; i < wordIndices[index].length; i++) { if (i > 0) sb.append(" "); sb.append(new String(allWords.image[wordIndices[index][i]])); } return sb; } /** Returns length of all arrays in this {@link AllPhrases}. */ public int size() { return wordIndices.length; } } /** * Information about all frequently appearing sequences of words found in the input {@link * PreprocessingContext#documents}. */ public AllPhrases allPhrases = new AllPhrases(); /** * Information about words and phrases that might be good cluster label candidates. Each entry in * each array corresponds to one label candidate. * * <p>All arrays in this class have the same length and values across different arrays correspond * to each other for the same index. */ public class AllLabels { /** * Feature index of the label candidate. Features whose values are less than the size of {@link * AllWords} arrays are single word features and point to entries in {@link AllWords}. Features * whose values are larger or equal to the size of {@link AllWords}, after subtracting the size * of {@link AllWords}, point to {@link AllPhrases}. * * <p>This array is produced by {@link LabelFilterProcessor}. */ public int[] featureIndex; /** * Indices of documents assigned to the label candidate. * * <p>This array is produced by {@link DocumentAssigner}. */ public BitSet[] documentIndices; /** * The first index in {@link #featureIndex} which points to {@link AllPhrases}, or -1 if there * are no phrases in {@link #featureIndex}. * * <p>This value is set by {@link LabelFilterProcessor}. * * @see #featureIndex */ public int firstPhraseIndex; /** For debugging purposes. */ @Override public String toString() { if (featureIndex == null) return UNINITIALIZED; StringWriter sw = new StringWriter(); TabularOutput t = new TabularOutput(sw); t.flushEvery(Integer.MAX_VALUE); t.addColumn("#"); t.addColumn("featureIdx"); t.addColumn("=>feature").alignLeft(); t.addColumn("documentIdx").alignLeft(); for (int i = 0; i < featureIndex.length; i++, t.nextRow()) { t.rowData( i, featureIndex[i], getLabel(i), documentIndices != null ? documentIndices[i].toString().replace(" ", "") : ""); } t.flush(); sw.append("\n"); return t.toString(); } private CharSequence getLabel(int index) { final int wordsSize = allWords.image.length; if (featureIndex[index] < wordsSize) return new String(allWords.image[featureIndex[index]]); else return allPhrases.getPhrase(featureIndex[index] - wordsSize); } } /** Information about words and phrases that might be good cluster label candidates. */ public final AllLabels allLabels = new AllLabels(); /** Returns <code>true</code> if this context contains any words. */ public boolean hasWords() { return allWords.image.length > 0; } /** Returns <code>true</code> if this context contains any label candidates. 
*/ public boolean hasLabels() { return allLabels.featureIndex != null && allLabels.featureIndex.length > 0; } @Override public String toString() { return "PreprocessingContext 0x" + Integer.toHexString(this.hashCode()) + "\n" + "== Fields:\n" + this.allFields.toString() + "== Tokens:\n" + this.allTokens.toString() + "== Words:\n" + this.allWords.toString() + "== Stems:\n" + this.allStems.toString() + "== Phrases:\n" + this.allPhrases.toString() + "== Labels:\n" + this.allLabels.toString(); } /** Static conversion between selected bits and an array of indexes of these bits. */ private static final int[][] bitsCache; static { bitsCache = new int[0x100][]; for (int i = 0; i < 0x100; i++) { bitsCache[i] = new int[Integer.bitCount(i & 0xFF)]; for (int v = 0, bit = 0, j = i & 0xff; j != 0; j >>>= 1, bit++) { if ((j & 0x1) != 0) bitsCache[i][v++] = bit; } } } /** Convert the selected bits in a byte to an array of indexes. */ public static int[] toFieldIndexes(byte b) { return bitsCache[b & 0xff]; } /* * These should really be package-private, shouldn't they? We'd need to move classes under pipeline. * here for accessibility. */ /** * This method should be invoked after all preprocessing contributors have been executed to * release temporary data structures. */ public void preprocessingFinished() { this.tokenCache = null; } /** Return a unique char buffer representing a given character sequence. */ public char[] intern(MutableCharArray chs) { if (tokenCache.contains(chs)) { return tokenCache.lget().getBuffer(); } else { final char[] tokenImage = new char[chs.length()]; System.arraycopy(chs.getBuffer(), chs.getStart(), tokenImage, 0, chs.length()); tokenCache.add(new MutableCharArray(tokenImage)); return tokenImage; } } }
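// Worked example (illustrative, derived from the bitsCache construction above): a byte value of
// 0x05 (binary 00000101) has bits 0 and 2 set, so the bit-packed fieldIndices entry 0x05 means
// the word or stem occurred in the fields at indexes 0 and 2 of AllFields.name.
int[] fields = PreprocessingContext.toFieldIndexes((byte) 0x05);
// fields == {0, 2}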
private ObjectLongOpenHashMap<DiscoveryNode> buildShardStates(
    final DiscoveryNodes nodes, MutableShardRouting shard) {
  ObjectLongOpenHashMap<DiscoveryNode> shardStates = cachedShardsState.get(shard.shardId());
  ObjectOpenHashSet<String> nodeIds;
  if (shardStates == null) {
    shardStates = new ObjectLongOpenHashMap<>();
    cachedShardsState.put(shard.shardId(), shardStates);
    nodeIds = ObjectOpenHashSet.from(nodes.dataNodes().keys());
  } else {
    // clean nodes that have failed
    shardStates
        .keys()
        .removeAll(
            new ObjectPredicate<DiscoveryNode>() {
              @Override
              public boolean apply(DiscoveryNode node) {
                return !nodes.nodeExists(node.id());
              }
            });
    nodeIds = ObjectOpenHashSet.newInstance();
    // we have cached results from before; see if the set of nodes changed and, if so, fetch from the new nodes
    for (ObjectCursor<DiscoveryNode> cursor : nodes.dataNodes().values()) {
      DiscoveryNode node = cursor.value;
      if (!shardStates.containsKey(node)) {
        nodeIds.add(node.id());
      }
    }
  }

  if (nodeIds.isEmpty()) {
    return shardStates;
  }

  String[] nodesIdsArray = nodeIds.toArray(String.class);
  TransportNodesListGatewayStartedShards.NodesGatewayStartedShards response =
      listGatewayStartedShards.list(shard.shardId(), nodesIdsArray, listTimeout).actionGet();
  if (logger.isDebugEnabled()) {
    if (response.failures().length > 0) {
      StringBuilder sb =
          new StringBuilder(shard + ": failures when trying to list shards on nodes:");
      for (int i = 0; i < response.failures().length; i++) {
        Throwable cause = ExceptionsHelper.unwrapCause(response.failures()[i]);
        if (cause instanceof ConnectTransportException) {
          continue;
        }
        sb.append("\n -> ").append(response.failures()[i].getDetailedMessage());
      }
      logger.debug(sb.toString());
    }
  }

  for (TransportNodesListGatewayStartedShards.NodeGatewayStartedShards nodeShardState : response) {
    // a version of -1 means the shard does not exist on that node, which is what the API returns
    logger.trace(
        "[{}] on node [{}] has version [{}] of shard",
        shard,
        nodeShardState.getNode(),
        nodeShardState.version());
    shardStates.put(nodeShardState.getNode(), nodeShardState.version());
  }
  return shardStates;
}
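// Illustrative sketch (values and names invented): the keys().removeAll(ObjectPredicate) call
// above prunes cache entries for departed nodes in place. The same pattern works for any HPPC
// map, dropping every entry whose key no longer satisfies some condition:
ObjectLongOpenHashMap<String> cache = new ObjectLongOpenHashMap<>();
cache.put("node-a", 3L);
cache.put("node-b", 7L);
cache
    .keys()
    .removeAll(
        new ObjectPredicate<String>() {
          @Override
          public boolean apply(String nodeId) {
            return !"node-a".equals(nodeId); // remove everything except node-a
          }
        });
// cache now holds a single entry: "node-a" -> 3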