/**
 * Returns true if {@code ch} is treated as punctuation for tokenization purposes.
 *
 * <p>"Punctuation" here is broad: all Unicode separator categories (space, line,
 * paragraph), control and format characters, every punctuation category, and every
 * symbol category (math, currency, modifier, other).
 */
private static boolean isPunctuation(char ch) {
  // Classify once via the Unicode general category, then test membership.
  final int type = Character.getType(ch);
  return type == Character.SPACE_SEPARATOR
      || type == Character.LINE_SEPARATOR
      || type == Character.PARAGRAPH_SEPARATOR
      || type == Character.CONTROL
      || type == Character.FORMAT
      || type == Character.DASH_PUNCTUATION
      || type == Character.START_PUNCTUATION
      || type == Character.END_PUNCTUATION
      || type == Character.CONNECTOR_PUNCTUATION
      || type == Character.OTHER_PUNCTUATION
      || type == Character.MATH_SYMBOL
      || type == Character.CURRENCY_SYMBOL
      || type == Character.MODIFIER_SYMBOL
      || type == Character.OTHER_SYMBOL
      || type == Character.INITIAL_QUOTE_PUNCTUATION
      || type == Character.FINAL_QUOTE_PUNCTUATION;
}
/**
 * Backtrace from the provided position, back to the last time we back-traced
 * ({@code lastBackTracePos}), accumulating the resulting tokens to the {@code pending}
 * list. The pending list is then in-reverse (last token should be returned first).
 *
 * <p>When {@code outputCompounds} and {@code searchMode} are set, a too-long token on
 * the best path may be saved aside as {@code altToken} while the trace is redirected
 * along a cheap-enough second-best (decompounded) path; the compound token is emitted
 * with a position length spanning its parts once the two paths rejoin.
 *
 * @param endPosData the lattice position to start backtracing from; its {@code pos}
 *     becomes the new {@code lastBackTracePos} when the trace completes
 * @param fromIDX index of the arriving arc in {@code endPosData} to follow backwards
 * @throws IOException propagated from callees (dictionary / lattice accesses) —
 *     NOTE(review): the exact throwing callee is not visible in this chunk
 */
private void backtrace(final Position endPosData, final int fromIDX) throws IOException {
  final int endPos = endPosData.pos;

  if (VERBOSE) {
    // NOTE: "pos" here still refers to the enclosing tokenizer's field; the local
    // "pos" below is declared after this statement, so it does not shadow it here.
    System.out.println(
        "\n backtrace: endPos="
            + endPos
            + " pos="
            + pos
            + "; "
            + (pos - lastBackTracePos)
            + " characters; last="
            + lastBackTracePos
            + " cost="
            + endPosData.costs[fromIDX]);
  }

  // Every token produced by this backtrace shares this one slice of the input buffer:
  final char[] fragment = buffer.get(lastBackTracePos, endPos - lastBackTracePos);

  if (dotOut != null) {
    // Optional lattice-debugging hook:
    dotOut.onBacktrace(this, positions, lastBackTracePos, endPosData, fromIDX, fragment, end);
  }

  int pos = endPos;
  int bestIDX = fromIDX;
  // Pending too-long (compound) token, emitted when the 2nd-best path rejoins it:
  Token altToken = null;

  // We trace backwards, so this will be the leftWordID of
  // the token after the one we are now on:
  int lastLeftWordID = -1;

  // Tokens emitted since the alternate (decompounded) trace began; used as the
  // compound token's positionLength when it is finally added:
  int backCount = 0;

  // TODO: sort of silly to make Token instances here; the
  // back trace has all info needed to generate the
  // token. So, we could just directly set the attrs,
  // from the backtrace, in incrementToken w/o ever
  // creating Token; we'd have to defer calling freeBefore
  // until after the backtrace was fully "consumed" by
  // incrementToken.

  while (pos > lastBackTracePos) {
    // System.out.println("BT: back pos=" + pos + " bestIDX=" + bestIDX);
    final Position posData = positions.get(pos);
    assert bestIDX < posData.count;

    int backPos = posData.backPos[bestIDX];
    assert backPos >= lastBackTracePos
        : "backPos=" + backPos + " vs lastBackTracePos=" + lastBackTracePos;
    int length = pos - backPos;
    Type backType = posData.backType[bestIDX];
    int backID = posData.backID[bestIDX];
    int nextBestIDX = posData.backIndex[bestIDX];

    if (outputCompounds && searchMode && altToken == null && backType != Type.USER) {

      // In searchMode, if best path had picked a too-long
      // token, we use the "penalty" to compute the allowed
      // max cost of an alternate back-trace. If we find an
      // alternate back trace with cost below that
      // threshold, we pursue it instead (but also output
      // the long token).
      // System.out.println(" 2nd best backPos=" + backPos + " pos=" + pos);

      final int penalty = computeSecondBestThreshold(backPos, pos - backPos);

      if (penalty > 0) {
        if (VERBOSE) {
          System.out.println(
              " compound="
                  + new String(buffer.get(backPos, pos - backPos))
                  + " backPos="
                  + backPos
                  + " pos="
                  + pos
                  + " penalty="
                  + penalty
                  + " cost="
                  + posData.costs[bestIDX]
                  + " bestIDX="
                  + bestIDX
                  + " lastLeftID="
                  + lastLeftWordID);
        }

        // Use the penalty to set maxCost on the 2nd best
        // segmentation:
        int maxCost = posData.costs[bestIDX] + penalty;
        if (lastLeftWordID != -1) {
          // Include the bigram connection cost to the token we already traced past:
          maxCost += costs.get(getDict(backType).getRightId(backID), lastLeftWordID);
        }

        // Now, prune all too-long tokens from the graph:
        pruneAndRescore(backPos, pos, posData.backIndex[bestIDX]);

        // Finally, find 2nd best back-trace and resume
        // backtrace there:
        int leastCost = Integer.MAX_VALUE;
        int leastIDX = -1;
        for (int idx = 0; idx < posData.count; idx++) {
          int cost = posData.costs[idx];
          // System.out.println(" idx=" + idx + " prevCost=" + cost);

          if (lastLeftWordID != -1) {
            cost +=
                costs.get(
                    getDict(posData.backType[idx]).getRightId(posData.backID[idx]),
                    lastLeftWordID);
            // System.out.println(" += bgCost=" +
            // costs.get(getDict(posData.backType[idx]).getRightId(posData.backID[idx]),
            // lastLeftWordID) + " -> " + cost);
          }
          // System.out.println("penalty " + posData.backPos[idx] + " to " + pos);
          // cost += computePenalty(posData.backPos[idx], pos - posData.backPos[idx]);
          if (cost < leastCost) {
            // System.out.println(" ** ");
            leastCost = cost;
            leastIDX = idx;
          }
        }
        // System.out.println(" leastIDX=" + leastIDX);

        if (VERBOSE) {
          System.out.println(
              " afterPrune: "
                  + posData.count
                  + " arcs arriving; leastCost="
                  + leastCost
                  + " vs threshold="
                  + maxCost
                  + " lastLeftWordID="
                  + lastLeftWordID);
        }

        if (leastIDX != -1 && leastCost <= maxCost && posData.backPos[leastIDX] != backPos) {
          // We should have pruned the altToken from the graph:
          assert posData.backPos[leastIDX] != backPos;

          // Save the current compound token, to output when
          // this alternate path joins back:
          altToken =
              new Token(
                  backID,
                  fragment,
                  backPos - lastBackTracePos,
                  length,
                  backType,
                  backPos,
                  getDict(backType));

          // Redirect our backtrace to 2nd best:
          bestIDX = leastIDX;
          nextBestIDX = posData.backIndex[bestIDX];

          backPos = posData.backPos[bestIDX];
          length = pos - backPos;
          backType = posData.backType[bestIDX];
          backID = posData.backID[bestIDX];
          backCount = 0;
          // System.out.println(" do alt token!");

        } else {
          // I think in theory it's possible there is no
          // 2nd best path, which is fine; in this case we
          // only output the compound token:
          // System.out.println(" no alt token! bestIDX=" + bestIDX);
        }
      }
    }

    // Offset of the current token's start within the shared fragment:
    final int offset = backPos - lastBackTracePos;
    assert offset >= 0;

    if (altToken != null && altToken.getPosition() >= backPos) {

      // We've backtraced to the position where the
      // compound token starts; add it now:

      // The pruning we did when we created the altToken
      // ensures that the back trace will align back with
      // the start of the altToken:
      assert altToken.getPosition() == backPos : altToken.getPosition() + " vs " + backPos;

      // NOTE: not quite right: the compound token may
      // have had all punctuation back traced so far, but
      // then the decompounded token at this position is
      // not punctuation. In this case backCount is 0,
      // but we should maybe add the altToken anyway...?

      if (backCount > 0) {
        backCount++;
        altToken.setPositionLength(backCount);
        if (VERBOSE) {
          System.out.println(" add altToken=" + altToken);
        }
        pending.add(altToken);
      } else {
        // This means alt token was all punct tokens:
        if (VERBOSE) {
          System.out.println(" discard all-punctuation altToken=" + altToken);
        }
        assert discardPunctuation;
      }
      altToken = null;
    }

    final Dictionary dict = getDict(backType);

    if (backType == Type.USER) {

      // Expand the phraseID we recorded into the actual
      // segmentation:
      final int[] wordIDAndLength = userDictionary.lookupSegmentation(backID);
      int wordID = wordIDAndLength[0];
      int current = 0;
      for (int j = 1; j < wordIDAndLength.length; j++) {
        final int len = wordIDAndLength[j];
        // System.out.println(" add user: len=" + len);
        pending.add(
            new Token(
                wordID + j - 1,
                fragment,
                current + offset,
                len,
                Type.USER,
                current + backPos,
                dict));
        if (VERBOSE) {
          System.out.println(" add USER token=" + pending.get(pending.size() - 1));
        }
        current += len;
      }

      // Reverse the tokens we just added, because when we
      // serve them up from incrementToken we serve in
      // reverse:
      Collections.reverse(
          pending.subList(pending.size() - (wordIDAndLength.length - 1), pending.size()));

      backCount += wordIDAndLength.length - 1;
    } else {

      if (extendedMode && backType == Type.UNKNOWN) {
        // In EXTENDED mode we convert unknown word into
        // unigrams:
        int unigramTokenCount = 0;
        // Walk backwards so the (reverse-served) pending list yields unigrams in
        // left-to-right order:
        for (int i = length - 1; i >= 0; i--) {
          int charLen = 1;
          if (i > 0 && Character.isLowSurrogate(fragment[offset + i])) {
            // Keep a surrogate pair together as a single two-char "unigram":
            i--;
            charLen = 2;
          }
          // System.out.println(" extended tok offset=" + (offset + i));
          if (!discardPunctuation || !isPunctuation(fragment[offset + i])) {
            pending.add(
                new Token(
                    CharacterDefinition.NGRAM,
                    fragment,
                    offset + i,
                    charLen,
                    Type.UNKNOWN,
                    backPos + i,
                    unkDictionary));
            unigramTokenCount++;
          }
        }
        backCount += unigramTokenCount;
      } else if (!discardPunctuation || length == 0 || !isPunctuation(fragment[offset])) {
        pending.add(new Token(backID, fragment, offset, length, backType, backPos, dict));
        if (VERBOSE) {
          System.out.println(" add token=" + pending.get(pending.size() - 1));
        }
        backCount++;
      } else {
        if (VERBOSE) {
          System.out.println(
              " skip punctuation token=" + new String(fragment, offset, length));
        }
      }
    }

    // Continue the trace from the start of the token we just handled:
    lastLeftWordID = dict.getLeftId(backID);
    pos = backPos;
    bestIDX = nextBestIDX;
  }

  lastBackTracePos = endPos;

  if (VERBOSE) {
    System.out.println(" freeBefore pos=" + endPos);
  }

  // Notify the circular buffers that we are done with
  // these positions:
  buffer.freeBefore(endPos);
  positions.freeBefore(endPos);
}