public HashSet<NGramSet> findCommonNGrams( String string1, String string2, int min, int max, boolean maximizePrimaryWindowSize) { ordered_scores = new TreeMap<Double, Integer>(); errors = new ArrayList<Error>(); HashSet<NGramSet> NGramsWithMatches = new HashSet<NGramSet>(); // ensure that min <= max if (min > max) { int temp = max; max = min; min = temp; logError("Min greater than max; assuming the opposite parameterization"); } char[] chars1 = string1.toCharArray(); char[] chars2 = string2.toCharArray(); List<String> words1 = scanForWords(chars1); List<String> words2 = scanForWords(chars2); // when testing, restrict the length of documents to be small if (isTesting) { int maxSub = 1000; words1 = words1.subList(0, maxSizeOutOfRangeForSource(maxSub, words1) ? words1.size() : maxSub); words2 = words2.subList(0, maxSizeOutOfRangeForSource(maxSub, words2) ? words2.size() : maxSub); } NGramSetImpl.setMatchCase(matchCase); NGramSetImpl.setUseStopWords(USESTOPWORDS); NGramSetImpl.setStrictness(STRICT); NGramSetImpl.setMinSize(min); int leftMax = (words1.size() <= max || maximizePrimaryWindowSize) ? words1.size() : max; int rightMax = (words2.size() <= max) ? words2.size() - 1 : max; HashMap<String, List<NGramSet>> map = new HashMap<String, List<NGramSet>>(); if (rightMax < max) { logError( "Window size greater than number of length of secondary text; decreasing secondary window size to: " + rightMax); } if (leftMax < max && maximizePrimaryWindowSize) { logError("Maximizing primary window"); } else if (leftMax < max) { logError("Max out of range for primary source. Scaling down to: " + leftMax); } ArrayList<NGramSet> nGrams1 = null; nGrams1 = getAllNGramsOfSize(words1, leftMax, null); // ArrayList<NGramSet> nGrams2 = getAllNGramsOfSize(words2, rightMax, map); findAllCommon(NGramsWithMatches, nGrams1, map); return NGramsWithMatches; }
private ArrayList<NGramSet> getAllNGramsOfSize( List<String> words, int size, HashMap<String, List<NGramSet>> map) { String processedWord = null; ArrayList<NGramSet> sets = new ArrayList<NGramSet>(words.size()); final int documentSize = words.size(); if (usePorterStemmer) { NGramSetImplStemmed current = new NGramSetImplStemmed(size); current.setMaxSize(size); current.setDocument(words); for (int i = 0; i < size && i < documentSize; i++) { processedWord = current.processWord(words.get(i)); // if map is null, then tracking doesn't matter // if processWord was null, it was a stop-word if (map != null && processedWord != null) { // System.out.println("Mapping: " + current.toString() + " for " // + processedWord); List<NGramSet> nGrams = map.get(processedWord); if (nGrams != null) { final int prevSize = nGrams.size(); nGrams.add(current); assert (prevSize != map.get(processedWord).size()); } else { List<NGramSet> l = new ArrayList<NGramSet>(); l.add(current); map.put(processedWord, l); } } } sets.add(current); NGramSetImplStemmed prev = current; for (int i = size; i < documentSize; i++) { current = new NGramSetImplStemmed((NGramSet) prev); processedWord = current.processWord(words.get(i)); current.popFirstWord(); sets.add(current); prev = current; if (map == null || processedWord == null) continue; List<String> relevantWords = current.getModifiedWordList(); for (String relevantWord : relevantWords) { List<NGramSet> nGrams = map.get(relevantWord); if (nGrams != null) { nGrams.add(current); } else { List<NGramSet> l = new ArrayList<NGramSet>(); l.add(current); map.put(relevantWord, l); } } } } else { NGramSetImpl current = new NGramSetImpl(size); current.setDocument(words); for (int i = 0; i < size && i < documentSize; i++) { processedWord = current.processWord(words.get(i)); if (map != null && processedWord != null) { List<NGramSet> nGrams = map.get(processedWord); if (nGrams != null) { nGrams.add(current); } else { List<NGramSet> l = new ArrayList<NGramSet>(); l.add(current); map.put(processedWord, l); } } } sets.add(current); NGramSetImpl prev = current; for (int i = size; i < documentSize; i++) { current = new NGramSetImpl((NGramSet) prev); processedWord = current.processWord(words.get(i)); current.popFirstWord(); sets.add(current); prev = current; if (map == null || processedWord == null) continue; List<String> relevantWords = current.getModifiedWordList(); for (String relevantWord : relevantWords) { List<NGramSet> nGrams = map.get(relevantWord); if (nGrams != null) { nGrams.add(current); } else { List<NGramSet> l = new ArrayList<NGramSet>(); l.add(current); map.put(relevantWord, l); } } } } // if(map != null) System.out.println("Map size: " + // map.entrySet().size()); return sets; }