Пример #1
0
 /**
  * Tokenizes the given string and returns the token texts as a sequence.
  *
  * <p>The input is trimmed and passed through {@code toUncased} before being handed to the
  * tokenizer obtained from {@code tf}.
  *
  * @param input raw input string; must not be {@code null}
  * @return the sequence built from each token's {@code TextAnnotation} value, in tokenizer order
  */
 @Override
 public Sequence<IString> process(String input) {
   final String trimmed = input.trim();
   final String normalized = toUncased(trimmed);
   final Tokenizer<CoreLabel> tokens = tf.getTokenizer(new StringReader(normalized));

   // Drain the tokenizer, keeping only the text of each token.
   final List<String> tokenTexts = new LinkedList<>();
   while (tokens.hasNext()) {
     final CoreLabel label = tokens.next();
     tokenTexts.add(label.get(TextAnnotation.class));
   }

   return IStrings.toIStringSequence(tokenTexts);
 }
Пример #2
0
  /**
   * Tokenizes the given string and aligns every output token with the whitespace-delimited
   * input token it was produced from.
   *
   * <p>The input is trimmed, passed through {@code toUncased}, and tokenized with the tokenizer
   * obtained from {@code tf}. The source side of the returned alignment is the
   * whitespace tokenization of the trimmed input; the target side is the tokenizer output.
   * Alignment points are recovered by concatenating the (uncased) original text of consecutive
   * output tokens until the concatenation equals the current input token.
   *
   * <p>If the output tokens cannot be mapped back onto the input tokens, a warning is printed to
   * {@code System.err} and the alignment built so far is returned.
   *
   * @param input raw input string; must not be {@code null}
   * @return the alignment between the whitespace-tokenized input and the tokenizer output
   */
  @Override
  public SymmetricalWordAlignment processAndAlign(String input) {
    input = input.trim();

    // Run through the tokenizer and convert to sequence
    String tokenizerInput = toUncased(input);
    String[] uncasedInputTokens = tokenizerInput.split("\\s+");
    Tokenizer<CoreLabel> tokenizer = tf.getTokenizer(new StringReader(tokenizerInput));
    List<CoreLabel> outputTokens = tokenizer.tokenize();
    IString[] outputSequence = new IString[outputTokens.size()];
    for (int i = 0; i < outputSequence.length; ++i) {
      String outputToken = outputTokens.get(i).get(TextAnnotation.class);
      outputSequence[i] = new IString(outputToken);
    }

    // Whitespace tokenization of input, create alignment
    Sequence<IString> inputSequence = IStrings.tokenize(input);
    assert inputSequence.size() == uncasedInputTokens.length;
    SymmetricalWordAlignment alignment =
        new SymmetricalWordAlignment(
            inputSequence, new SimpleSequence<IString>(true, outputSequence));

    // Generate the alignments: i indexes input tokens, j indexes output tokens.
    StringBuilder inputToken = new StringBuilder();
    for (int i = 0, j = 0, limit = outputTokens.size(); j < limit; ++j) {
      // Guard BEFORE recording the alignment point: once every input token has been consumed,
      // any remaining output token has no valid input index to align with.
      if (i >= uncasedInputTokens.length) {
        System.err.println("WARNING: Non-invertible input: " + input);
        break;
      }
      alignment.addAlign(i, j);

      // Accumulate the original text of the output tokens until it spells the input token.
      CoreLabel tokenizedToken = outputTokens.get(j);
      String inputTokenPart = toUncased(tokenizedToken.get(OriginalTextAnnotation.class));
      inputToken.append(inputTokenPart);
      if (uncasedInputTokens[i].equals(inputToken.toString())) {
        // Current input token fully covered; move on and start a fresh accumulation.
        ++i;
        inputToken.setLength(0);
      }
    }
    return alignment;
  }