/* * THIS IS NOT LONGER USED was just implemented in the first VERSION * Implementation of the inefficient algorithm given in the paper * Searches for matches of size l for string S, whose minmum overlap of ngrams is min_overlap, in the dictionary d * @param s string we are looking for * @param min_overlap min number of matching ngrams between s and Y, for Y to be a match for S * @param d dictionary * @param l size of Y, given that Y is a match for S * @return the list of Ids which are matches to S of size L and whose min number of common ngrams with S is min_overlap */ private static ArrayList<Integer> overlap( String s, int min_overlap, DictionaryGenerator d, int l) { // list of matches to return at the end ArrayList<Integer> listOfMatches = new ArrayList<Integer>(); // list of ngrams of the string S ArrayList<String> ngramList = Ngram.splitIntoNGrams(s, d.getNgramSize()); // count of how many times a string has matched, in order to make sure the overlap is higher // than min_overlap HashMap<Integer, Integer> m = new HashMap<Integer, Integer>(); for (int i = 0; i < ngramList.size(); i++) { String ngram = ngramList.get(i); ArrayList<Integer> matches_ = d.searchTerm(l, ngram); System.err.println( "looking for matches of ngram: " + ngramList.get(i) + " in strings of size: " + l + " --Matches:" + matches_.size()); HashSet<Integer> matches = new HashSet<Integer>(); for (int j = 0; j < matches_.size(); j++) { matches.add(matches_.get(j)); } Iterator it = matches.iterator(); while (it.hasNext()) { Integer currentMatch = (Integer) it.next(); int currentCount = 0; if (m.containsKey(currentMatch)) { currentCount = m.get(currentMatch); } else { m.put(currentMatch, 0); } int newValue = currentCount + 1; m.put(currentMatch, newValue); if (min_overlap <= newValue) { System.err.println("match:" + currentMatch + "--" + newValue); listOfMatches.add(currentMatch); } } } return listOfMatches; }
/* * Implementation of the efficient algorithm given in the paper * Searches for matches of size l for string S, whose minmum overlap of ngrams is min_overlap, in the dictionary d * @param s string we are looking for * @param min_overlap min number of matching ngrams between s and Y, for Y to be a match for S * @param d dictionary * @param l size of Y, given that Y is a match for S * @return the list of Ids which are matches to S of size L and whose min number of common ngrams with S is min_overlap */ private static ArrayList<Integer> overlap_nonnaive( String s, int min_overlap, DictionaryGenerator d, int l) { // list of matches to return at the end ArrayList<Integer> listOfMatches = new ArrayList<Integer>(); // list of ngrams of the string S ArrayList<String> ngramList = Ngram.splitIntoNGrams(s, d.getNgramSize()); // count of how many times a string has matched, in order to make sure the overlap is higher // than min_overlap HashMap<Integer, Integer> m = new HashMap<Integer, Integer>(); // order listOfNgrams increnmentally to the least common ngram to the most common PriorityQueue queueEntitiesPerNgram = new PriorityQueue(); for (int i = 0; i < ngramList.size(); i++) { ArrayList<Integer> tempListOfTermsForNgramI = d.searchTerm(l, ngramList.get(i)); Orderable<ArrayList<Integer>> orderable_temp = new Orderable<ArrayList<Integer>>( tempListOfTermsForNgramI, i, tempListOfTermsForNgramI.size()); queueEntitiesPerNgram.add(orderable_temp); } List<Orderable<ArrayList<Integer>>> queueIntoList = new LinkedList<Orderable<ArrayList<Integer>>>(queueEntitiesPerNgram); ArrayList<Orderable<ArrayList<Integer>>> listOfEntitiesPerNgram = new ArrayList<Orderable<ArrayList<Integer>>>(queueIntoList); int numberOfNgramsForS = Ngram.getNumberOfNgrams(s, d.getNgramSize()); for (int k = 0; k <= numberOfNgramsForS - min_overlap; k++) { for (int z = 0; z < listOfEntitiesPerNgram.get(k).getObject().size(); z++) { int currentMatch = listOfEntitiesPerNgram.get(k).getObject().get(z); int currentCount = 0; if (m.containsKey(currentMatch)) { currentCount = m.get(currentMatch); } else { m.put(currentMatch, 0); } int newValue = currentCount + 1; m.put(currentMatch, newValue); } } List<Integer> listOfM_ = new LinkedList<Integer>(m.keySet()); ArrayList<Integer> listOfM = new ArrayList<Integer>(listOfM_); for (int k = numberOfNgramsForS - min_overlap + 1; k < listOfEntitiesPerNgram.size(); k++) { for (int z = 0; z < listOfM.size(); z++) { int currentZ = listOfM.get(z); // if z is found in the list K if (find(currentZ, listOfEntitiesPerNgram.get(k).getObject())) { m.put(currentZ, m.get(currentZ) + 1); } if (min_overlap <= m.get(currentZ)) { listOfMatches.add(currentZ); } } } return listOfMatches; }