/** Build the cluster's label from suffix tree edge indices. */
private String buildLabel(int[] phraseIndices) {
  final boolean joinWithSpace =
      context.language.getLanguageCode() != LanguageCode.CHINESE_SIMPLIFIED;

  // Count the number of terms first.
  int termsCount = 0;
  for (int j = 0; j < phraseIndices.length; j += 2) {
    termsCount += phraseIndices[j + 1] - phraseIndices[j] + 1;
  }

  // Extract terms info for the phrase and construct the label.
  final boolean[] stopwords = new boolean[termsCount];
  final char[][] images = new char[termsCount][];
  final short[] tokenTypes = context.allWords.type;

  int k = 0;
  for (int i = 0; i < phraseIndices.length; i += 2) {
    for (int j = phraseIndices[i]; j <= phraseIndices[i + 1]; j++, k++) {
      final int termIndex = sb.input.get(j);
      images[k] = context.allWords.image[termIndex];
      stopwords[k] = TokenTypeUtils.isCommon(tokenTypes[termIndex]);
    }
  }

  return LabelFormatter.format(images, stopwords, joinWithSpace);
}
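// A minimal illustrative sketch, not part of the original source: phraseIndices encodes a
// phrase as a flat sequence of inclusive [start, end] pairs of token positions in sb.input.
// The hypothetical helper below mirrors the counting loop above; for example, {2, 4, 7, 7}
// covers tokens 2..4 and 7..7, four terms in total.
private static int countPhraseTerms(int[] phraseIndices) {
  int termsCount = 0;
  for (int j = 0; j < phraseIndices.length; j += 2) {
    // Both bounds of each pair are inclusive, hence the +1.
    termsCount += phraseIndices[j + 1] - phraseIndices[j] + 1;
  }
  return termsCount;
}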
/**
 * Calculates the "effective phrase length", that is, the number of non-ignored words in the
 * phrase.
 */
final int effectivePhraseLength(IntStack path) {
  final int[] terms = sb.input.buffer;
  final int lower = ignoreWordIfInFewerDocs;
  final int upper = (int) (ignoreWordIfInHigherDocsPercent * documents.size());

  int effectivePhraseLen = 0;
  for (int i = 0; i < path.size(); i += 2) {
    for (int j = path.get(i); j <= path.get(i + 1); j++) {
      final int termIndex = terms[j];

      // If this term is a stop word, don't count it.
      if (TokenTypeUtils.isCommon(context.allWords.type[termIndex])) {
        continue;
      }

      // Don't count the word if it occurs in fewer than `lower` documents or in more than
      // the allowed fraction of the input collection.
      final int docCount = context.allWords.tfByDocument[termIndex].length / 2;
      if (docCount < lower || docCount > upper) {
        continue;
      }

      effectivePhraseLen++;
    }
  }

  return effectivePhraseLen;
}
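// A minimal sketch, not part of the original source, assuming (as the length / 2 computation
// above implies) that allWords.tfByDocument[termIndex] holds flat (documentIndex, frequency)
// pairs. It restates the document-frequency filter in isolation: a word is ignored when it
// occurs in fewer than `lower` or in more than `upper` documents.
private static boolean ignoredByDocumentFrequency(int[] tfByDocument, int lower, int upper) {
  // One (documentIndex, frequency) pair per document the word occurs in.
  final int docCount = tfByDocument.length / 2;
  return docCount < lower || docCount > upper;
}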
/** Collect all unique non-stop words from a phrase. */
private void appendUniqueWords(IntStack words, IntStack offsets, PhraseCandidate p) {
  assert p.cluster.phrases.size() == 1;

  final int start = words.size();
  final int[] phraseIndices = p.cluster.phrases.get(0);
  final short[] tokenTypes = context.allWords.type;
  for (int i = 0; i < phraseIndices.length; i += 2) {
    for (int j = phraseIndices[i]; j <= phraseIndices[i + 1]; j++) {
      final int termIndex = sb.input.get(j);
      if (!TokenTypeUtils.isCommon(tokenTypes[termIndex])) {
        words.push(termIndex);
      }
    }
  }

  // Sort words; we don't care about their order when counting subsets.
  Arrays.sort(words.buffer, start, words.size());

  // Reorder to keep only unique words.
  int j = start;
  for (int i = start + 1; i < words.size(); i++) {
    if (words.buffer[j] != words.buffer[i]) {
      words.buffer[++j] = words.buffer[i];
    }
  }
  words.elementsCount = j + 1;

  offsets.push(start, words.size() - start);
}
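// A standalone sketch, hypothetical and not in the original source, of the sort-then-compact
// idiom used above: once the range is sorted, duplicates are adjacent, so a single pass with a
// write cursor keeps the first occurrence of each value. The original appears to rely on the
// range being non-empty (each phrase candidate contributes at least one non-stop word); this
// version guards the empty case explicitly.
private static int dedupSortedRange(int[] buffer, int start, int end) {
  if (start >= end) {
    return end; // Empty range: nothing to compact.
  }
  Arrays.sort(buffer, start, end);
  int write = start;
  for (int read = start + 1; read < end; read++) {
    if (buffer[write] != buffer[read]) {
      buffer[++write] = buffer[read];
    }
  }
  return write + 1; // Exclusive end of the compacted, unique range.
}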
/**
 * Computes stem indices of words that are one-word label candidates or are non-stop words from
 * phrase label candidates.
 */
private int[] computeRequiredStemIndices(PreprocessingContext context) {
  final int[] labelsFeatureIndex = context.allLabels.featureIndex;
  final int[] wordsStemIndex = context.allWords.stemIndex;
  final short[] wordsTypes = context.allWords.type;
  final int[][] phrasesWordIndices = context.allPhrases.wordIndices;
  final int wordCount = wordsStemIndex.length;

  final int[][] stemsTfByDocument = context.allStems.tfByDocument;
  final int documentCount = context.documents.size();
  final BitSet requiredStemIndices = new BitSet(labelsFeatureIndex.length);

  for (int i = 0; i < labelsFeatureIndex.length; i++) {
    final int featureIndex = labelsFeatureIndex[i];
    if (featureIndex < wordCount) {
      addStemIndex(
          wordsStemIndex, documentCount, stemsTfByDocument, requiredStemIndices, featureIndex);
    } else {
      final int[] wordIndices = phrasesWordIndices[featureIndex - wordCount];
      for (int j = 0; j < wordIndices.length; j++) {
        final int wordIndex = wordIndices[j];
        if (!TokenTypeUtils.isCommon(wordsTypes[wordIndex])) {
          addStemIndex(
              wordsStemIndex, documentCount, stemsTfByDocument, requiredStemIndices, wordIndex);
        }
      }
    }
  }

  return requiredStemIndices.asIntLookupContainer().toArray();
}
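// An illustrative sketch of the feature-index convention the branch above relies on: label
// feature indices below wordCount address single words directly, while indices at or above
// wordCount address phrases, offset by wordCount into allPhrases.wordIndices. The helper
// below is hypothetical, not part of the original source; it only makes the decoding explicit.
private static int[] wordIndicesOfFeature(
    int featureIndex, int wordCount, int[][] phrasesWordIndices) {
  if (featureIndex < wordCount) {
    // A single-word feature: the feature index is the word index itself.
    return new int[] {featureIndex};
  }
  // A phrase feature: shift past the word range to address the phrase arrays.
  return phrasesWordIndices[featureIndex - wordCount];
}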
/**
 * Consider certain special cases of internal suffix tree nodes. The suffix tree may contain
 * internal nodes with paths starting or ending with a stop word (common word). We have the
 * following interesting scenarios:
 *
 * <dl>
 *   <dt>IF LEADING STOPWORD: IGNORE THE NODE.
 *   <dd>There MUST be a phrase with this stopword chopped off in the suffix tree (a suffix of
 *       this phrase) and its frequency will be just as high.
 *   <dt>IF TRAILING STOPWORDS:
 *   <dd>Check if the edge leading to the current node is composed entirely of stopwords. If so,
 *       there must be a parent node that contains non-stopwords and we can ignore the current
 *       node. Otherwise we can chop off the trailing stopwords from the current node's phrase
 *       (this phrase cannot be duplicated anywhere in the tree because if it were, there would
 *       have to be a branch somewhere in the suffix tree on the edge).
 * </dl>
 */
final boolean checkAcceptablePhrase(IntStack path) {
  assert path.size() > 0;

  final int[] terms = sb.input.buffer;
  final short[] tokenTypes = context.allWords.type;

  // Ignore nodes that start with a stop word.
  if (TokenTypeUtils.isCommon(tokenTypes[terms[path.get(0)]])) {
    return false;
  }

  // Check the last edge of the current node.
  int i = path.get(path.size() - 2);
  int j = path.get(path.size() - 1);
  final int k = j;
  while (i <= j && TokenTypeUtils.isCommon(tokenTypes[terms[j]])) {
    j--;
  }

  if (j < i) {
    // If the edge contains only stopwords, ignore the node.
    return false;
  } else if (j < k) {
    // There have been trailing stop words on the edge. Chop them off.
    path.buffer[path.size() - 1] = j;
  }

  // Check the total phrase length (in words, including stopwords).
  int termsCount = 0;
  for (j = 0; j < path.size(); j += 2) {
    termsCount += path.get(j + 1) - path.get(j) + 1;
  }

  if (termsCount > maxDescPhraseLength) {
    return false;
  }

  return true;
}
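// A hypothetical sketch of the trailing-stopword trim above: given the inclusive bounds
// [i, j] of the node's last edge, walk j backwards past stopwords. Returning -1 means the
// edge consists only of stopwords (the node is rejected); a smaller j shortens the phrase;
// an unchanged j leaves it intact. `isStopword` stands in for the
// TokenTypeUtils.isCommon(tokenTypes[terms[j]]) test at a given position.
private static int trimTrailingStopwords(
    int i, int j, java.util.function.IntPredicate isStopword) {
  while (i <= j && isStopword.test(j)) {
    j--;
  }
  return j < i ? -1 : j; // -1: all stopwords; otherwise the new inclusive end index.
}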
/** Collect all non-stop words from a phrase, keeping duplicates (unlike {@link #appendUniqueWords}). */
private void appendWords(IntStack words, IntStack offsets, PhraseCandidate p) {
  final int start = words.size();

  final int[] phraseIndices = p.cluster.phrases.get(0);
  final short[] tokenTypes = context.allWords.type;
  for (int i = 0; i < phraseIndices.length; i += 2) {
    for (int j = phraseIndices[i]; j <= phraseIndices[i + 1]; j++) {
      final int termIndex = sb.input.get(j);
      if (!TokenTypeUtils.isCommon(tokenTypes[termIndex])) {
        words.push(termIndex);
      }
    }
  }

  offsets.push(start, words.size() - start);
}
/**
 * Build a cluster's label from suffix tree edge indices, including some debugging and diagnostic
 * information.
 */
@SuppressWarnings("unused")
private String buildDebugLabel(int[] phraseIndices) {
  final StringBuilder b = new StringBuilder();

  String sep = "";
  int k = 0;
  final short[] tokenTypes = context.allWords.type;
  for (int i = 0; i < phraseIndices.length; i += 2) {
    for (int j = phraseIndices[i]; j <= phraseIndices[i + 1]; j++, k++) {
      b.append(sep);

      final int termIndex = sb.input.get(j);
      b.append(context.allWords.image[termIndex]);
      if (TokenTypeUtils.isCommon(tokenTypes[termIndex])) {
        b.append("[S]");
      }
      sep = " ";
    }
    sep = "_";
  }

  return b.toString();
}
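// Example output, derived from the formatting rules above on hypothetical data: for a phrase
// made of two edges covering the images ["the", "quick"] and ["fox"], where "the" is a stop
// word, the debug label would be "the[S] quick_fox": spaces join words within an edge, '_'
// joins edges, and "[S]" marks common (stop) words.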