// for debugging
 @SuppressWarnings("unused")
 static String brToString(BytesRef b) {
   try {
     return b.utf8ToString() + " " + b;
   } catch (Throwable t) {
     // If BytesRef isn't actually UTF8, or it's eg a
     // prefix of UTF8 that ends mid-unicode-char, we
     // fallback to hex:
     return b.toString();
   }
 }
 // for debugging
 String brToString(BytesRef b) {
   if (b == null) {
     return "null";
   } else {
     try {
       return b.utf8ToString() + " " + b;
     } catch (Throwable t) {
       // If BytesRef isn't actually UTF8, or it's eg a
       // prefix of UTF8 that ends mid-unicode-char, we
       // fallback to hex:
       return b.toString();
     }
   }
 }
Ejemplo n.º 3
0
 /**
  * Returns human-readable form of the term text. If the term is not unicode, the raw bytes will be
  * printed instead.
  */
 public static final String toString(BytesRef termText) {
   // the term might not be text, but usually is. so we make a best effort
   CharsetDecoder decoder =
       IOUtils.CHARSET_UTF_8
           .newDecoder()
           .onMalformedInput(CodingErrorAction.REPORT)
           .onUnmappableCharacter(CodingErrorAction.REPORT);
   try {
     return decoder
         .decode(ByteBuffer.wrap(termText.bytes, termText.offset, termText.length))
         .toString();
   } catch (CharacterCodingException e) {
     return termText.toString();
   }
 }
  /**
   * Converts the list of Tokens to a list of NamedLists representing the tokens.
   *
   * @param tokens Tokens to convert
   * @param context The analysis context
   * @return List of NamedLists containing the relevant information taken from the tokens
   */
  private List<NamedList> convertTokensToNamedLists(
      final List<AttributeSource> tokens, AnalysisContext context) {
    final List<NamedList> tokensNamedLists = new ArrayList<NamedList>();

    final int[] positions = new int[tokens.size()];
    int position = 0;
    for (int i = 0, c = tokens.size(); i < c; i++) {
      AttributeSource token = tokens.get(i);
      position += token.addAttribute(PositionIncrementAttribute.class).getPositionIncrement();
      positions[i] = position;
    }

    // sort the tokens by absoulte position
    new SorterTemplate() {
      @Override
      protected void swap(int i, int j) {
        final int p = positions[i];
        positions[i] = positions[j];
        positions[j] = p;
        Collections.swap(tokens, i, j);
      }

      @Override
      protected int compare(int i, int j) {
        return positions[i] - positions[j];
      }

      @Override
      protected void setPivot(int i) {
        pivot = positions[i];
      }

      @Override
      protected int comparePivot(int j) {
        return pivot - positions[j];
      }

      private int pivot;
    }.mergeSort(0, tokens.size() - 1);

    FieldType fieldType = context.getFieldType();

    final CharArr textBuf = new CharArr();
    for (int i = 0, c = tokens.size(); i < c; i++) {
      AttributeSource token = tokens.get(i);
      final NamedList<Object> tokenNamedList = new SimpleOrderedMap<Object>();
      final TermToBytesRefAttribute termAtt = token.getAttribute(TermToBytesRefAttribute.class);
      BytesRef rawBytes = termAtt.getBytesRef();
      termAtt.fillBytesRef();

      textBuf.reset();
      fieldType.indexedToReadable(rawBytes, textBuf);
      final String text = textBuf.toString();

      tokenNamedList.add("text", text);

      if (token.hasAttribute(CharTermAttribute.class)) {
        final String rawText = token.getAttribute(CharTermAttribute.class).toString();
        if (!rawText.equals(text)) {
          tokenNamedList.add("raw_text", rawText);
        }
      }

      tokenNamedList.add("raw_bytes", rawBytes.toString());

      if (context.getTermsToMatch().contains(rawBytes)) {
        tokenNamedList.add("match", true);
      }

      tokenNamedList.add("position", positions[i]);

      token.reflectWith(
          new AttributeReflector() {
            public void reflect(Class<? extends Attribute> attClass, String key, Object value) {
              // leave out position and bytes term
              if (TermToBytesRefAttribute.class.isAssignableFrom(attClass)) return;
              if (CharTermAttribute.class.isAssignableFrom(attClass)) return;
              if (PositionIncrementAttribute.class.isAssignableFrom(attClass)) return;

              String k = attClass.getName() + '#' + key;

              // map keys for "standard attributes":
              if (ATTRIBUTE_MAPPING.containsKey(k)) {
                k = ATTRIBUTE_MAPPING.get(k);
              }

              if (value instanceof Payload) {
                final Payload p = (Payload) value;
                value = new BytesRef(p.getData()).toString();
              }

              tokenNamedList.add(k, value);
            }
          });

      tokensNamedLists.add(tokenNamedList);
    }

    return tokensNamedLists;
  }
Ejemplo n.º 5
0
  public SparseInstances readIndex(String indexPath, String destFile, int threshold)
      throws Exception {

    if (indexPath == null || destFile == null) {
      System.out.println("error: indexPath or destFile is null\n");
      return null;
    }

    DirectoryReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(indexPath)));
    Terms terms = SlowCompositeReaderWrapper.wrap(reader).terms(reviewKey);

    int capacity = (int) terms.size();
    HashMap<String, Integer> wordDict = new HashMap<>(capacity);
    capacity = capacity > 65535 ? 65535 : capacity;
    SparseInstances instData = new SparseInstances(capacity, reader.numDocs());
    TermsEnum termsEnum = terms.iterator();
    int index = 0;
    BytesRef term = null;
    String strTerm = null;
    while ((term = termsEnum.next()) != null) {
      strTerm = term.toString();
      if (termsEnum.totalTermFreq() < threshold) {
        continue;
      }
      if (strTerm.isEmpty()) {
        continue;
      }
      if (wordDict.get(strTerm) != null) {
        continue;
      }
      instData.addAttribute(strTerm);
      index++;
    }
    int numAtt = instData.numAttributes();
    int numInst = instData.numInstances();
    Integer attIndex = null;
    String id = null;
    int termIndex = 0;
    for (int docIndex = 0; docIndex < numInst; docIndex++) {
      id = reader.document(docIndex).getField(idKey).stringValue();
      Terms docTerms = reader.getTermVector(docIndex, reviewKey);
      if (docTerms == null) {
        continue;
      }
      int[] indices = new int[(int) docTerms.size()];
      double[] attValues = new double[(int) docTerms.size()];
      termsEnum = docTerms.iterator();
      termIndex = 0;
      while ((term = termsEnum.next()) != null) {
        strTerm = term.toString();
        attIndex = wordDict.get(strTerm);
        if (attIndex == null) {
          continue;
        }
        indices[termIndex] = attIndex.intValue();
        attValues[termIndex] = termsEnum.totalTermFreq();
      }
      ESparseInstance instance = new ESparseInstance(id, 1.0, attValues, indices, numAtt);
      instData.addInstance(instance);
    }

    return null;
  }