// for debugging @SuppressWarnings("unused") static String brToString(BytesRef b) { try { return b.utf8ToString() + " " + b; } catch (Throwable t) { // If BytesRef isn't actually UTF8, or it's eg a // prefix of UTF8 that ends mid-unicode-char, we // fallback to hex: return b.toString(); } }
// for debugging String brToString(BytesRef b) { if (b == null) { return "null"; } else { try { return b.utf8ToString() + " " + b; } catch (Throwable t) { // If BytesRef isn't actually UTF8, or it's eg a // prefix of UTF8 that ends mid-unicode-char, we // fallback to hex: return b.toString(); } } }
/** * Returns human-readable form of the term text. If the term is not unicode, the raw bytes will be * printed instead. */ public static final String toString(BytesRef termText) { // the term might not be text, but usually is. so we make a best effort CharsetDecoder decoder = IOUtils.CHARSET_UTF_8 .newDecoder() .onMalformedInput(CodingErrorAction.REPORT) .onUnmappableCharacter(CodingErrorAction.REPORT); try { return decoder .decode(ByteBuffer.wrap(termText.bytes, termText.offset, termText.length)) .toString(); } catch (CharacterCodingException e) { return termText.toString(); } }
/** * Converts the list of Tokens to a list of NamedLists representing the tokens. * * @param tokens Tokens to convert * @param context The analysis context * @return List of NamedLists containing the relevant information taken from the tokens */ private List<NamedList> convertTokensToNamedLists( final List<AttributeSource> tokens, AnalysisContext context) { final List<NamedList> tokensNamedLists = new ArrayList<NamedList>(); final int[] positions = new int[tokens.size()]; int position = 0; for (int i = 0, c = tokens.size(); i < c; i++) { AttributeSource token = tokens.get(i); position += token.addAttribute(PositionIncrementAttribute.class).getPositionIncrement(); positions[i] = position; } // sort the tokens by absoulte position new SorterTemplate() { @Override protected void swap(int i, int j) { final int p = positions[i]; positions[i] = positions[j]; positions[j] = p; Collections.swap(tokens, i, j); } @Override protected int compare(int i, int j) { return positions[i] - positions[j]; } @Override protected void setPivot(int i) { pivot = positions[i]; } @Override protected int comparePivot(int j) { return pivot - positions[j]; } private int pivot; }.mergeSort(0, tokens.size() - 1); FieldType fieldType = context.getFieldType(); final CharArr textBuf = new CharArr(); for (int i = 0, c = tokens.size(); i < c; i++) { AttributeSource token = tokens.get(i); final NamedList<Object> tokenNamedList = new SimpleOrderedMap<Object>(); final TermToBytesRefAttribute termAtt = token.getAttribute(TermToBytesRefAttribute.class); BytesRef rawBytes = termAtt.getBytesRef(); termAtt.fillBytesRef(); textBuf.reset(); fieldType.indexedToReadable(rawBytes, textBuf); final String text = textBuf.toString(); tokenNamedList.add("text", text); if (token.hasAttribute(CharTermAttribute.class)) { final String rawText = token.getAttribute(CharTermAttribute.class).toString(); if (!rawText.equals(text)) { tokenNamedList.add("raw_text", rawText); } } tokenNamedList.add("raw_bytes", 
rawBytes.toString()); if (context.getTermsToMatch().contains(rawBytes)) { tokenNamedList.add("match", true); } tokenNamedList.add("position", positions[i]); token.reflectWith( new AttributeReflector() { public void reflect(Class<? extends Attribute> attClass, String key, Object value) { // leave out position and bytes term if (TermToBytesRefAttribute.class.isAssignableFrom(attClass)) return; if (CharTermAttribute.class.isAssignableFrom(attClass)) return; if (PositionIncrementAttribute.class.isAssignableFrom(attClass)) return; String k = attClass.getName() + '#' + key; // map keys for "standard attributes": if (ATTRIBUTE_MAPPING.containsKey(k)) { k = ATTRIBUTE_MAPPING.get(k); } if (value instanceof Payload) { final Payload p = (Payload) value; value = new BytesRef(p.getData()).toString(); } tokenNamedList.add(k, value); } }); tokensNamedLists.add(tokenNamedList); } return tokensNamedLists; }
/**
 * Reads the Lucene index at {@code indexPath} and builds a sparse data set from the
 * {@code reviewKey} field: one attribute per retained term and one instance per
 * document that has a term vector for that field.
 *
 * <p>A term becomes an attribute when its total term frequency in the index is at
 * least {@code threshold} and its text is non-empty. Each instance stores, for every
 * retained term occurring in the document, the frequency reported by the document's
 * term vector. Documents without a term vector for the field are skipped.
 *
 * <p>{@code reviewKey} and {@code idKey} are defined outside this method.
 *
 * @param indexPath file-system path of the Lucene index directory
 * @param destFile only checked for {@code null}; not otherwise used by this method
 * @param threshold minimum total term frequency for a term to become an attribute
 * @return the populated data set, or {@code null} when {@code indexPath} or
 *     {@code destFile} is {@code null}
 * @throws IllegalStateException if the index has no terms for the {@code reviewKey} field
 * @throws Exception if the index cannot be opened or read
 */
public SparseInstances readIndex(String indexPath, String destFile, int threshold)
    throws Exception {
  if (indexPath == null || destFile == null) {
    System.out.println("error: indexPath or destFile is null\n");
    return null;
  }

  // try-with-resources: the reader was previously never closed.
  try (DirectoryReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(indexPath)))) {
    Terms terms = SlowCompositeReaderWrapper.wrap(reader).terms(reviewKey);
    if (terms == null) {
      // Fail with a clear message instead of a bare NullPointerException below.
      throw new IllegalStateException("no indexed terms for field " + reviewKey);
    }

    // Terms.size() may be -1 when the count is unknown; a negative value must not
    // reach the HashMap constructor.
    int capacity = Math.max(0, (int) terms.size());
    HashMap<String, Integer> wordDict = new HashMap<>(capacity);
    SparseInstances instData = new SparseInstances(Math.min(capacity, 65535), reader.numDocs());

    // Pass 1: choose the vocabulary and remember each term's attribute index.
    // NOTE(review): assumes addAttribute assigns indices 0, 1, 2, ... in call order
    // (the original kept a matching counter but never stored it) -- confirm.
    TermsEnum termsEnum = terms.iterator();
    BytesRef term;
    int index = 0;
    while ((term = termsEnum.next()) != null) {
      if (termsEnum.totalTermFreq() < threshold) {
        continue;
      }
      // utf8ToString() yields the term text; BytesRef.toString() is a hex dump and
      // is never empty, which made the emptiness check below ineffective.
      String strTerm = term.utf8ToString();
      if (strTerm.isEmpty() || wordDict.containsKey(strTerm)) {
        continue;
      }
      instData.addAttribute(strTerm);
      wordDict.put(strTerm, index);
      index++;
    }

    // Pass 2: one sparse instance per document that has a term vector.
    int numAtt = instData.numAttributes();
    int numInst = instData.numInstances();
    for (int docIndex = 0; docIndex < numInst; docIndex++) {
      String id = reader.document(docIndex).getField(idKey).stringValue();
      Terms docTerms = reader.getTermVector(docIndex, reviewKey);
      if (docTerms == null) {
        continue;
      }

      // NOTE(review): assumes the term vector reports a known (non-negative) size.
      int docTermCount = (int) docTerms.size();
      int[] indices = new int[docTermCount];
      double[] attValues = new double[docTermCount];

      TermsEnum docTermsEnum = docTerms.iterator();
      int used = 0;
      while ((term = docTermsEnum.next()) != null) {
        Integer attIndex = wordDict.get(term.utf8ToString());
        if (attIndex == null) {
          // Term was filtered out of the vocabulary in pass 1.
          continue;
        }
        indices[used] = attIndex;
        attValues[used] = docTermsEnum.totalTermFreq();
        used++; // previously never advanced, so every write hit slot 0
      }

      // Drop the unused tail so filtered-out terms do not show up as spurious
      // "attribute 0 with value 0" entries.
      if (used < docTermCount) {
        indices = java.util.Arrays.copyOf(indices, used);
        attValues = java.util.Arrays.copyOf(attValues, used);
      }

      instData.addInstance(new ESparseInstance(id, 1.0, attValues, indices, numAtt));
    }

    // Previously the populated data set was discarded and null returned.
    return instData;
  }
}