/**
 * Splits the surface of {@code node} into one node per {@code char} (UTF-16 code unit).
 *
 * <p>Every produced node uses word id 0, the unknown dictionary and type
 * {@link ViterbiNode.Type#UNKNOWN}; its start index is the start index of {@code node} plus the
 * offset of the character within the surface.
 *
 * @param node node whose surface is split
 * @return the single-character nodes in left-to-right order; empty if the surface is empty
 */
private LinkedList<ViterbiNode> convertUnknownWordToUnigramNode(ViterbiNode node) {
    final int wordId = 0;
    final String text = node.getSurface();
    final LinkedList<ViterbiNode> pieces = new LinkedList<>();

    // Walk the surface left to right and append, which gives the same ordering as walking
    // right to left and prepending.
    for (int offset = 0; offset < text.length(); offset++) {
        String single = text.substring(offset, offset + 1);
        int position = node.getStartIndex() + offset;
        pieces.addLast(
            new ViterbiNode(wordId, single, unknownDictionary, position, ViterbiNode.Type.UNKNOWN));
    }

    return pieces;
}
private void updateNode(ViterbiNode[] viterbiNodes, ViterbiNode node) { int backwardConnectionId = node.getLeftId(); int wordCost = node.getWordCost(); int leastPathCost = DEFAULT_COST; for (ViterbiNode leftNode : viterbiNodes) { // If array doesn't contain any more ViterbiNodes, continue to next index if (leftNode == null) { return; } else { // cost = [total cost from BOS to previous node] + [connection cost between previous node // and current node] + [word cost] int pathCost = leftNode.getPathCost() + costs.get(leftNode.getRightId(), backwardConnectionId) + wordCost; // Add extra cost for long nodes in "Search mode". if (mode == TokenizerBase.Mode.SEARCH || mode == TokenizerBase.Mode.EXTENDED) { pathCost += getPenaltyCost(node); } // If total cost is lower than before, set current previous node as best left node (previous // means left). if (pathCost < leastPathCost) { leastPathCost = pathCost; node.setPathCost(leastPathCost); node.setLeftNode(leftNode); } } } }
private int getPenaltyCost(ViterbiNode node) { int pathCost = 0; String surface = node.getSurface(); int length = surface.length(); if (length > kanjiPenaltyLengthTreshold) { if (isKanjiOnly(surface)) { // Process only Kanji keywords pathCost += (length - kanjiPenaltyLengthTreshold) * kanjiPenalty; } else if (length > otherPenaltyLengthThreshold) { pathCost += (length - otherPenaltyLengthThreshold) * otherPenalty; } } return pathCost; }
private LinkedList<ViterbiNode> backtrackBestPath(ViterbiNode eos) { ViterbiNode node = eos; LinkedList<ViterbiNode> result = new LinkedList<>(); result.add(node); while (true) { ViterbiNode leftNode = node.getLeftNode(); if (leftNode == null) { break; } else { // Extended mode converts unknown word into unigram nodes if (mode == TokenizerBase.Mode.EXTENDED && leftNode.getType() == ViterbiNode.Type.UNKNOWN) { LinkedList<ViterbiNode> uniGramNodes = convertUnknownWordToUnigramNode(leftNode); result.addAll(uniGramNodes); } else { result.addFirst(leftNode); } node = leftNode; } } return result; }