/** * Create a new JapaneseTokenizer. * * @param factory the AttributeFactory to use * @param userDictionary Optional: if non-null, user dictionary. * @param discardPunctuation true if punctuation tokens should be dropped from the output. * @param mode tokenization mode. */ public JapaneseTokenizer( AttributeFactory factory, UserDictionary userDictionary, boolean discardPunctuation, Mode mode) { super(factory); dictionary = TokenInfoDictionary.getInstance(); fst = dictionary.getFST(); unkDictionary = UnknownDictionary.getInstance(); characterDefinition = unkDictionary.getCharacterDefinition(); this.userDictionary = userDictionary; costs = ConnectionCosts.getInstance(); fstReader = fst.getBytesReader(); if (userDictionary != null) { userFST = userDictionary.getFST(); userFSTReader = userFST.getBytesReader(); } else { userFST = null; userFSTReader = null; } this.discardPunctuation = discardPunctuation; switch (mode) { case SEARCH: searchMode = true; extendedMode = false; outputCompounds = true; break; case EXTENDED: searchMode = true; extendedMode = true; outputCompounds = false; break; default: searchMode = false; extendedMode = false; outputCompounds = false; break; } buffer.reset(this.input); resetState(); dictionaryMap.put(Type.KNOWN, dictionary); dictionaryMap.put(Type.UNKNOWN, unkDictionary); dictionaryMap.put(Type.USER, userDictionary); }
/* Incrementally parse some more characters. This runs * the viterbi search forwards "enough" so that we * generate some more tokens. How much forward depends on * the chars coming in, since some chars could cause * longer-lasting ambiguity in the parsing. Once the * ambiguity is resolved, then we back trace, produce * the pending tokens, and return. */ private void parse() throws IOException { if (VERBOSE) { System.out.println("\nPARSE"); } // Advances over each position (character): while (true) { if (buffer.get(pos) == -1) { // End break; } final Position posData = positions.get(pos); final boolean isFrontier = positions.getNextPos() == pos + 1; if (posData.count == 0) { // No arcs arrive here; move to next position: if (VERBOSE) { System.out.println(" no arcs in; skip pos=" + pos); } pos++; continue; } if (pos > lastBackTracePos && posData.count == 1 && isFrontier) { // if (pos > lastBackTracePos && posData.count == 1 && isFrontier) { // We are at a "frontier", and only one node is // alive, so whatever the eventual best path is must // come through this node. So we can safely commit // to the prefix of the best path at this point: backtrace(posData, 0); // Re-base cost so we don't risk int overflow: posData.costs[0] = 0; if (pending.size() != 0) { return; } else { // This means the backtrace only produced // punctuation tokens, so we must keep parsing. } } if (pos - lastBackTracePos >= MAX_BACKTRACE_GAP) { // Safety: if we've buffered too much, force a // backtrace now. We find the least-cost partial // path, across all paths, backtrace from it, and // then prune all others. Note that this, in // general, can produce the wrong result, if the // total best path did not in fact back trace // through this partial best path. But it's the // best we can do... (short of not having a // safety!). // First pass: find least cost partial path so far, // including ending at future positions: int leastIDX = -1; int leastCost = Integer.MAX_VALUE; Position leastPosData = null; for (int pos2 = pos; pos2 < positions.getNextPos(); pos2++) { final Position posData2 = positions.get(pos2); for (int idx = 0; idx < posData2.count; idx++) { // System.out.println(" idx=" + idx + " cost=" + cost); final int cost = posData2.costs[idx]; if (cost < leastCost) { leastCost = cost; leastIDX = idx; leastPosData = posData2; } } } // We will always have at least one live path: assert leastIDX != -1; // Second pass: prune all but the best path: for (int pos2 = pos; pos2 < positions.getNextPos(); pos2++) { final Position posData2 = positions.get(pos2); if (posData2 != leastPosData) { posData2.reset(); } else { if (leastIDX != 0) { posData2.costs[0] = posData2.costs[leastIDX]; posData2.lastRightID[0] = posData2.lastRightID[leastIDX]; posData2.backPos[0] = posData2.backPos[leastIDX]; posData2.backIndex[0] = posData2.backIndex[leastIDX]; posData2.backID[0] = posData2.backID[leastIDX]; posData2.backType[0] = posData2.backType[leastIDX]; } posData2.count = 1; } } backtrace(leastPosData, 0); // Re-base cost so we don't risk int overflow: Arrays.fill(leastPosData.costs, 0, leastPosData.count, 0); if (pos != leastPosData.pos) { // We jumped into a future position: assert pos < leastPosData.pos; pos = leastPosData.pos; } if (pending.size() != 0) { return; } else { // This means the backtrace only produced // punctuation tokens, so we must keep parsing. continue; } } if (VERBOSE) { System.out.println("\n extend @ pos=" + pos + " char=" + (char) buffer.get(pos)); } if (VERBOSE) { System.out.println(" " + posData.count + " arcs in"); } boolean anyMatches = false; // First try user dict: if (userFST != null) { userFST.getFirstArc(arc); int output = 0; for (int posAhead = posData.pos; ; posAhead++) { final int ch = buffer.get(posAhead); if (ch == -1) { break; } if (userFST.findTargetArc(ch, arc, arc, posAhead == posData.pos, userFSTReader) == null) { break; } output += arc.output.intValue(); if (arc.isFinal()) { if (VERBOSE) { System.out.println( " USER word " + new String(buffer.get(pos, posAhead - pos + 1)) + " toPos=" + (posAhead + 1)); } add( userDictionary, posData, posAhead + 1, output + arc.nextFinalOutput.intValue(), Type.USER, false); anyMatches = true; } } } // TODO: we can be more aggressive about user // matches? if we are "under" a user match then don't // extend KNOWN/UNKNOWN paths? if (!anyMatches) { // Next, try known dictionary matches fst.getFirstArc(arc); int output = 0; for (int posAhead = posData.pos; ; posAhead++) { final int ch = buffer.get(posAhead); if (ch == -1) { break; } // System.out.println(" match " + (char) ch + " posAhead=" + posAhead); if (fst.findTargetArc(ch, arc, arc, posAhead == posData.pos, fstReader) == null) { break; } output += arc.output.intValue(); // Optimization: for known words that are too-long // (compound), we should pre-compute the 2nd // best segmentation and store it in the // dictionary instead of recomputing it each time a // match is found. if (arc.isFinal()) { dictionary.lookupWordIds(output + arc.nextFinalOutput.intValue(), wordIdRef); if (VERBOSE) { System.out.println( " KNOWN word " + new String(buffer.get(pos, posAhead - pos + 1)) + " toPos=" + (posAhead + 1) + " " + wordIdRef.length + " wordIDs"); } for (int ofs = 0; ofs < wordIdRef.length; ofs++) { add( dictionary, posData, posAhead + 1, wordIdRef.ints[wordIdRef.offset + ofs], Type.KNOWN, false); anyMatches = true; } } } } // In the case of normal mode, it doesn't process unknown word greedily. if (!searchMode && unknownWordEndIndex > posData.pos) { pos++; continue; } final char firstCharacter = (char) buffer.get(pos); if (!anyMatches || characterDefinition.isInvoke(firstCharacter)) { // Find unknown match: final int characterId = characterDefinition.getCharacterClass(firstCharacter); final boolean isPunct = isPunctuation(firstCharacter); // NOTE: copied from UnknownDictionary.lookup: int unknownWordLength; if (!characterDefinition.isGroup(firstCharacter)) { unknownWordLength = 1; } else { // Extract unknown word. Characters with the same character class are considered to be // part of unknown word unknownWordLength = 1; for (int posAhead = pos + 1; unknownWordLength < MAX_UNKNOWN_WORD_LENGTH; posAhead++) { final int ch = buffer.get(posAhead); if (ch == -1) { break; } if (characterId == characterDefinition.getCharacterClass((char) ch) && isPunctuation((char) ch) == isPunct) { unknownWordLength++; } else { break; } } } unkDictionary.lookupWordIds( characterId, wordIdRef); // characters in input text are supposed to be the same if (VERBOSE) { System.out.println( " UNKNOWN word len=" + unknownWordLength + " " + wordIdRef.length + " wordIDs"); } for (int ofs = 0; ofs < wordIdRef.length; ofs++) { add( unkDictionary, posData, posData.pos + unknownWordLength, wordIdRef.ints[wordIdRef.offset + ofs], Type.UNKNOWN, false); } unknownWordEndIndex = posData.pos + unknownWordLength; } pos++; } end = true; if (pos > 0) { final Position endPosData = positions.get(pos); int leastCost = Integer.MAX_VALUE; int leastIDX = -1; if (VERBOSE) { System.out.println(" end: " + endPosData.count + " nodes"); } for (int idx = 0; idx < endPosData.count; idx++) { // Add EOS cost: final int cost = endPosData.costs[idx] + costs.get(endPosData.lastRightID[idx], 0); // System.out.println(" idx=" + idx + " cost=" + cost + " (pathCost=" + // endPosData.costs[idx] + " bgCost=" + costs.get(endPosData.lastRightID[idx], 0) + ") // backPos=" + endPosData.backPos[idx]); if (cost < leastCost) { leastCost = cost; leastIDX = idx; } } backtrace(endPosData, leastIDX); } else { // No characters in the input string; return no tokens! } }