Beispiel #1
0
  /** Performs tokenization and saves the results to the <code>context</code>. */
  public void tokenize(PreprocessingContext context) {
    // Documents to tokenize
    final List<Document> documents = context.documents;

    // Fields to tokenize
    final String[] fieldNames = documentFields.toArray(new String[documentFields.size()]);

    if (fieldNames.length > 8) {
      throw new ProcessingException("Maximum number of tokenized fields is 8.");
    }

    // Prepare arrays
    images = Lists.newArrayList();
    tokenTypes = new ShortArrayList();
    documentIndices = new IntArrayList();
    fieldIndices = new ByteArrayList();

    final Iterator<Document> docIterator = documents.iterator();
    int documentIndex = 0;
    final ITokenizer ts = context.language.getTokenizer();
    final MutableCharArray wrapper = new MutableCharArray(CharArrayUtils.EMPTY_ARRAY);

    while (docIterator.hasNext()) {
      final Document doc = docIterator.next();

      boolean hadTokens = false;
      for (int i = 0; i < fieldNames.length; i++) {
        final byte fieldIndex = (byte) i;
        final String fieldName = fieldNames[i];
        final String fieldValue = doc.getField(fieldName);

        if (!StringUtils.isEmpty(fieldValue)) {
          try {
            short tokenType;

            ts.reset(new StringReader(fieldValue));
            if ((tokenType = ts.nextToken()) != ITokenizer.TT_EOF) {
              if (hadTokens) addFieldSeparator(documentIndex);
              do {
                ts.setTermBuffer(wrapper);
                add(documentIndex, fieldIndex, context.intern(wrapper), tokenType);
              } while ((tokenType = ts.nextToken()) != ITokenizer.TT_EOF);
              hadTokens = true;
            }
          } catch (IOException e) {
            // Not possible (StringReader above)?
            throw ExceptionUtils.wrapAsRuntimeException(e);
          }
        }
      }

      if (docIterator.hasNext()) {
        addDocumentSeparator();
      }

      documentIndex++;
    }

    addTerminator();

    // Save results in the PreprocessingContext
    context.allTokens.documentIndex = documentIndices.toArray();
    context.allTokens.fieldIndex = fieldIndices.toArray();
    context.allTokens.image = images.toArray(new char[images.size()][]);
    context.allTokens.type = tokenTypes.toArray();
    context.allFields.name = fieldNames;

    // Clean up
    images = null;
    fieldIndices = null;
    tokenTypes = null;
    documentIndices = null;
  }
Beispiel #2
0
 /** Adds custom token code to the sequence. May be used to add separator constants. */
 void add(int documentIndex, byte fieldIndex, char[] image, short tokenTypeCode) {
   documentIndices.add(documentIndex);
   fieldIndices.add(fieldIndex);
   images.add(image);
   tokenTypes.add(tokenTypeCode);
 }