Beispiel #1
0
  /**
   * Naive n-gram overlap search. According to the original note this was only used in the first
   * version of the code and is no longer called; it implements the inefficient algorithm given in
   * the paper.
   *
   * <p>For every n-gram of {@code s} the dictionary is asked for the ids of the strings of size
   * {@code l} that contain that n-gram, and a per-id counter is incremented (at most once per
   * n-gram lookup). An id is reported as soon as its counter reaches {@code min_overlap}.
   *
   * @param s string we are looking for
   * @param min_overlap minimum number of n-grams a dictionary string Y must share with {@code s}
   *     for Y to count as a match
   * @param d dictionary
   * @param l size of the candidate strings Y
   * @return the ids of the strings of size {@code l} that share at least {@code min_overlap}
   *     n-grams with {@code s}; each id appears at most once
   */
  private static ArrayList<Integer> overlap(
      String s, int min_overlap, DictionaryGenerator d, int l) {
    // list of matches to return at the end
    ArrayList<Integer> listOfMatches = new ArrayList<Integer>();

    // list of ngrams of the string S
    ArrayList<String> ngramList = Ngram.splitIntoNGrams(s, d.getNgramSize());

    // number of n-gram lookups in which each id has appeared so far
    HashMap<Integer, Integer> m = new HashMap<Integer, Integer>();

    // An id is appended exactly once, at the moment its counter first reaches the threshold.
    // Counters start at 1, so a threshold below 1 behaves like 1 (every id that is seen matches).
    int threshold = Math.max(min_overlap, 1);

    for (String ngram : ngramList) {
      ArrayList<Integer> matches_ = d.searchTerm(l, ngram);
      System.err.println(
          "looking for matches of ngram: "
              + ngram
              + " in strings of size: "
              + l
              + " --Matches:"
              + matches_.size());

      // de-duplicate so that one n-gram lookup increments an id's counter at most once
      HashSet<Integer> matches = new HashSet<Integer>(matches_);

      for (Integer currentMatch : matches) {
        Integer previousCount = m.get(currentMatch);
        int newValue = (previousCount == null) ? 1 : previousCount + 1;
        m.put(currentMatch, newValue);

        // "==" rather than "<=": the old "<=" check re-added the same id on every further hit
        if (newValue == threshold) {
          System.err.println("match:" + currentMatch + "--" + newValue);
          listOfMatches.add(currentMatch);
        }
      }
    }

    return listOfMatches;
  }
Beispiel #2
0
  /**
   * Looks up {@code s} in the dictionary and prints the matching terms to standard error.
   *
   * <p>The term is first preprocessed, then the measure supplies the range of candidate sizes.
   * For every size in that range the measure supplies the minimum n-gram overlap, and the ids
   * returned by {@code overlap_nonnaive} are collected into a set. Finally every collected id is
   * translated back to its term and printed.
   *
   * @param d dictionary to search in
   * @param m measure providing the size range and the minimum overlap per size
   * @param s term to search for
   */
  public static void search(DictionaryGenerator d, Measure m, String s) {

    // preprocess S (the original note says this adds the extra characters described in the paper)
    s = Ngram.preprocessTerm(s, d.getNgramSize());

    // bounds of the candidate sizes that have to be inspected
    int upperSize = m.max(s, d.getNgramSize());
    int lowerSize = m.min(s, d.getNgramSize());

    // debugging output
    System.err.println("------------- ");
    System.err.println("searching: " + s);
    System.err.println("range: [" + lowerSize + "," + upperSize + "]");

    // ids found so far; a set, so an id found for several sizes is kept once
    HashSet<Integer> foundIds = new HashSet<Integer>();

    // walk through every candidate size in [lowerSize, upperSize]
    int size = lowerSize;
    while (size <= upperSize) {
      // minimum overlap, called t in the paper
      int requiredOverlap = m.t(s, d.getNgramSize(), size);
      System.err.println("min_overlap:" + requiredOverlap);

      // collect the ids matching at this size
      foundIds.addAll(overlap_nonnaive(s, requiredOverlap, d, size));
      size++;
    }

    // results are ids, so translate each one back to its term before printing
    System.err.println("Result:");
    for (int id : foundIds) {
      System.err.println("\t " + d.getTerm(id));
    }
  }
Beispiel #3
0
  /**
   * Implementation of the efficient overlap algorithm given in the paper.
   *
   * <p>Searches the dictionary for strings of size {@code l} that share at least
   * {@code min_overlap} n-grams with {@code s}. It works in three steps:
   *
   * <ol>
   *   <li>Fetch, for every n-gram of {@code s}, the ids of the size-{@code l} strings containing
   *       it, and arrange these id lists in the order defined by {@code Orderable} (built from the
   *       list sizes; intended to go from the least to the most common n-gram).
   *   <li>Count occurrences over the first {@code numberOfNgrams - min_overlap + 1} lists. Only
   *       ids seen there remain candidates: an id absent from all of them can appear in at most
   *       {@code min_overlap - 1} lists.
   *   <li>For the remaining lists, only look the candidates up via {@code find} and bump their
   *       counters; then report every candidate whose counter reached {@code min_overlap}.
   * </ol>
   *
   * @param s string we are looking for
   * @param min_overlap minimum number of n-grams a dictionary string Y must share with {@code s}
   *     for Y to count as a match
   * @param d dictionary
   * @param l size of the candidate strings Y
   * @return the ids of the strings of size {@code l} that share at least {@code min_overlap}
   *     n-grams with {@code s}; each id appears at most once
   */
  private static ArrayList<Integer> overlap_nonnaive(
      String s, int min_overlap, DictionaryGenerator d, int l) {
    // list of matches to return at the end
    ArrayList<Integer> listOfMatches = new ArrayList<Integer>();

    // list of ngrams of the string S
    ArrayList<String> ngramList = Ngram.splitIntoNGrams(s, d.getNgramSize());

    // how many of the inspected id lists each candidate id has appeared in
    HashMap<Integer, Integer> m = new HashMap<Integer, Integer>();

    // one id list per n-gram, queued so they can be retrieved in Orderable's order
    PriorityQueue<Orderable<ArrayList<Integer>>> queueEntitiesPerNgram =
        new PriorityQueue<Orderable<ArrayList<Integer>>>();

    for (int i = 0; i < ngramList.size(); i++) {
      ArrayList<Integer> tempListOfTermsForNgramI = d.searchTerm(l, ngramList.get(i));
      queueEntitiesPerNgram.add(
          new Orderable<ArrayList<Integer>>(
              tempListOfTermsForNgramI, i, tempListOfTermsForNgramI.size()));
    }

    // Drain with poll(): copying the queue into a list goes through its iterator, which does not
    // traverse the elements in priority order, so the lists would not actually end up sorted.
    ArrayList<Orderable<ArrayList<Integer>>> listOfEntitiesPerNgram =
        new ArrayList<Orderable<ArrayList<Integer>>>(queueEntitiesPerNgram.size());
    while (!queueEntitiesPerNgram.isEmpty()) {
      listOfEntitiesPerNgram.add(queueEntitiesPerNgram.poll());
    }

    int numberOfNgramsForS = Ngram.getNumberOfNgrams(s, d.getNgramSize());
    int totalLists = listOfEntitiesPerNgram.size();

    // Number of leading lists used to generate candidates, clamped to the lists that exist so
    // that min_overlap <= 0 (or min_overlap > number of n-grams) cannot index out of bounds.
    int candidateLists =
        Math.max(0, Math.min(totalLists, numberOfNgramsForS - min_overlap + 1));

    // Step 2: every id in the leading lists becomes a candidate and gets counted.
    for (int k = 0; k < candidateLists; k++) {
      ArrayList<Integer> termsForNgram = listOfEntitiesPerNgram.get(k).getObject();
      for (int z = 0; z < termsForNgram.size(); z++) {
        int currentMatch = termsForNgram.get(z);
        Integer previousCount = m.get(currentMatch);
        m.put(currentMatch, (previousCount == null) ? 1 : previousCount + 1);
      }
    }

    // snapshot of the candidate ids; no new candidates are added after this point
    ArrayList<Integer> listOfM = new ArrayList<Integer>(m.keySet());

    // Step 3a: in the remaining lists only check whether the known candidates occur.
    for (int k = candidateLists; k < totalLists; k++) {
      ArrayList<Integer> termsForNgram = listOfEntitiesPerNgram.get(k).getObject();
      for (int z = 0; z < listOfM.size(); z++) {
        int currentZ = listOfM.get(z);
        if (find(currentZ, termsForNgram)) {
          m.put(currentZ, m.get(currentZ) + 1);
        }
      }
    }

    // Step 3b: report each qualifying candidate once. Doing this after the loops (instead of
    // inside the verification loop) also covers the case where no list is left to verify,
    // e.g. min_overlap == 1, and avoids adding the same id once per remaining list.
    for (int z = 0; z < listOfM.size(); z++) {
      int currentZ = listOfM.get(z);
      if (min_overlap <= m.get(currentZ)) {
        listOfMatches.add(currentZ);
      }
    }

    return listOfMatches;
  }