/**
 * Trims {@code input}, passes it through {@code toUncased}, tokenizes the
 * result with a tokenizer obtained from {@code tf}, and returns the
 * {@code TextAnnotation} value of every token as a sequence.
 *
 * @param input raw text to preprocess; must not be {@code null}
 * @return the token strings converted via {@code IStrings.toIStringSequence}
 */
@Override
public Sequence<IString> process(String input) {
  final String uncased = toUncased(input.trim());
  final Tokenizer<CoreLabel> stream = tf.getTokenizer(new StringReader(uncased));

  // Drain the tokenizer, keeping only the surface text of each token.
  final List<String> words = new LinkedList<>();
  for (; stream.hasNext(); ) {
    final CoreLabel label = stream.next();
    words.add(label.get(TextAnnotation.class));
  }
  return IStrings.toIStringSequence(words);
}
@Override public SymmetricalWordAlignment processAndAlign(String input) { input = input.trim(); // Run through the tokenizer and convert to sequence String tokenizerInput = toUncased(input); String[] uncasedInputTokens = tokenizerInput.split("\\s+"); Tokenizer<CoreLabel> tokenizer = tf.getTokenizer(new StringReader(tokenizerInput)); List<CoreLabel> outputTokens = tokenizer.tokenize(); IString[] outputSequence = new IString[outputTokens.size()]; for (int i = 0; i < outputSequence.length; ++i) { String outputToken = outputTokens.get(i).get(TextAnnotation.class); outputSequence[i] = new IString(outputToken); } // Whitespace tokenization of input, create alignment Sequence<IString> inputSequence = IStrings.tokenize(input); assert inputSequence.size() == uncasedInputTokens.length; SymmetricalWordAlignment alignment = new SymmetricalWordAlignment( inputSequence, new SimpleSequence<IString>(true, outputSequence)); // Generate the alignments StringBuilder inputToken = new StringBuilder(); for (int i = 0, j = 0, limit = outputTokens.size(); j < limit; ++j) { CoreLabel tokenizedToken = outputTokens.get(j); String inputTokenPart = toUncased(tokenizedToken.get(OriginalTextAnnotation.class)); alignment.addAlign(i, j); inputToken.append(inputTokenPart); if (i >= uncasedInputTokens.length) { System.err.println("WARNING: Non-invertible input: " + input); break; } if (uncasedInputTokens[i].equals(inputToken.toString())) { ++i; inputToken = new StringBuilder(); } } return alignment; }