/** * This method creates a list of trivially HierarchicalPhrases (i.e. they're really just * contiguous phrases, but we will want to perform some of the HierarchialPhrase operations on * them). Sorts the positions. Adds the results to the cache. * * <p>The construction of more complex hierarchical phrases is handled within the prefix tree. * * <p>This method performs deterministic sampling, as described in Lopez (2008) p59: * * <blockquote> * * To resolve this issue, we used deterministic sampling. Whenever a source phrase occurs more * frequently than the maximum sample size, we take our samples at uniform intervals over the set * of locations returned by the suffix array. With this strategy in place, hypotheses receive the * same feature weights between different runs of the decoder, the results are deterministic, and * the MERT algorithm converges at the same rate as it does without sampling. * * </blockquote> * * @param startPositions an unsorted list of the positions in the corpus where the matched phrases * begin * @param pattern a contiguous phrase * @return a list of trivially hierarchical phrases */ protected HierarchicalPhrases createHierarchicalPhrases( int[] startPositions, Pattern pattern, PrefixTree prefixTree) { if (startPositions == null) { return HierarchicalPhrases.emptyList(prefixTree); } else if (hierarchicalPhraseCache.containsKey(pattern)) { return hierarchicalPhraseCache.get(pattern); } else { Arrays.sort(startPositions); // int length = pattern.size(); HierarchicalPhrases hierarchicalPhrases = new HierarchicalPhrases(pattern, startPositions, prefixTree); // ArrayList<HierarchicalPhrase> hierarchicalPhrases = new // ArrayList<HierarchicalPhrase>(startPositions.length); // // int step = //(startPositions.length<sampleSize) ? 
1 : startPositions.length / sampleSize; // 1; // for(int i = 0; i < startPositions.length; i+=step) { // int[] position = {startPositions[i]}; // int[] endPosition = {startPositions[i] + length}; // HierarchicalPhrase hierarchicalPhrase = new HierarchicalPhrase(pattern, position, // endPosition, corpus, length); // hierarchicalPhrases.add(hierarchicalPhrase); // } hierarchicalPhraseCache.put(pattern, hierarchicalPhrases); return hierarchicalPhrases; } }
/**
 * Looks up the hierarchical phrases cached for a pattern. This is a pure cache read: nothing is
 * computed or stored.
 *
 * @param pattern the pattern used as the cache key
 * @return the value the cache holds for {@code pattern}, or {@code null} if the cache has no
 *     value for it
 */
public HierarchicalPhrases getMatchingPhrases(Pattern pattern) {
  HierarchicalPhrases cachedPhrases = hierarchicalPhraseCache.get(pattern);
  return cachedPhrases;
}