Пример #1
0
  /**
   * Computes a distance score between this profile and another one.
   *
   * <p>Two passes are made: one over the other profile's sorted entries, looked up in this
   * profile's ngram map, and one over this profile's sorted entries, looked up in the other
   * profile's ngram map. In each pass an entry found on both sides adds half of the absolute
   * frequency difference, and an entry missing from the opposite side adds its whole frequency.
   *
   * <p>If anything throws while scoring, the exception is logged as a warning and the sum
   * accumulated up to that point is returned.
   *
   * @param another ngram profile to compare against
   * @return accumulated distance; 0 means an exact match
   */
  public float getSimilarity(NGramProfile another) {

    float distance = 0;

    try {
      // Pass 1: entries of the other profile measured against this profile.
      for (Iterator it = another.getSorted().iterator(); it.hasNext(); ) {
        final NGramEntry theirs = (NGramEntry) it.next();
        if (!ngrams.containsKey(theirs.seq)) {
          distance += theirs.frequency;
          continue;
        }
        final NGramEntry ours = (NGramEntry) ngrams.get(theirs.seq);
        distance += Math.abs((theirs.frequency - ours.frequency)) / 2;
      }

      // Pass 2: entries of this profile measured against the other profile.
      for (Iterator it = getSorted().iterator(); it.hasNext(); ) {
        final NGramEntry ours = (NGramEntry) it.next();
        if (!another.ngrams.containsKey(ours.seq)) {
          distance += ours.frequency;
          continue;
        }
        final NGramEntry theirs = (NGramEntry) another.ngrams.get(ours.seq);
        distance += Math.abs((ours.frequency - theirs.frequency)) / 2;
      }
    } catch (final Exception e) {
      LOGGER.warn(e);
    }
    return distance;
  }
Пример #2
0
  /**
   * Creates a new language profile from a (preferably quite large) text stream.
   *
   * <p>The whole stream is decoded through a single reader, so a multi-byte character that happens
   * to straddle a read boundary is still decoded correctly. The stream is not closed here; it
   * remains the caller's responsibility.
   *
   * <p>If reading fails (an {@link IOException}, which includes an unsupported encoding name), a
   * warning is logged and the profile is built from whatever text was decoded before the failure.
   *
   * @param name is the name of the profile
   * @param is is the stream to read
   * @param encoding is the character encoding of the stream
   * @return the new profile, after analyzing the decoded text
   */
  public static NGramProfile create(String name, InputStream is, String encoding) {

    NGramProfile newProfile =
        new NGramProfile(name, ABSOLUTE_MIN_NGRAM_LENGTH, ABSOLUTE_MAX_NGRAM_LENGTH);

    char buffer[] = new char[4096];
    StringBuffer text = new StringBuffer();
    int len;

    try {
      // Decode with one stateful reader instead of converting each byte chunk separately:
      // per-chunk conversion corrupts characters whose bytes are split across two reads.
      java.io.Reader reader =
          new java.io.InputStreamReader(new BufferedInputStream(is), encoding);
      while ((len = reader.read(buffer)) != -1) {
        text.append(buffer, 0, len);
      }
    } catch (final IOException e) {
      LOGGER.warn("Exception raised while creating profile.", e);
    }

    newProfile.analyze(text);
    return newProfile;
  }
Пример #3
0
  /**
   * Command-line entry point, used for testing only.
   *
   * <p>Supported options (each takes exactly three values):
   *
   * <ul>
   *   <li>{@code -create profilename filename encoding} builds a profile from the file and saves
   *       it as {@code profilename.FILE_EXTENSION}
   *   <li>{@code -similarity file1 file2 encoding} builds and normalizes a profile from each file
   *       and prints their similarity
   *   <li>{@code -score profile-name filename encoding} loads the saved profile and prints its
   *       similarity to a profile built from the file
   * </ul>
   *
   * <p>If several options are given, the last one wins. When no arguments are given, an option is
   * missing values, or no known option is present, the usage text is printed to standard error and
   * the JVM exits with status -1.
   *
   * @param args command-line arguments as described above
   * @throws Exception if a file cannot be opened, read, written or closed
   */
  public static void main(String args[]) throws Exception {

    final String usage =
        "Usage: NGramProfile "
            + "[-create profilename filename encoding] "
            + "[-similarity file1 file2 encoding] "
            + "[-score profile-name filename encoding]";
    int command = 0;

    final int CREATE = 1;
    final int SIMILARITY = 2;
    final int SCORE = 3;

    String profilename = "";
    String filename = "";
    String filename2 = "";
    String encoding = "";

    if (args.length == 0) {
      System.err.println(usage);
      System.exit(-1);
    }

    for (int i = 0; i < args.length; i++) { // parse command line
      final boolean isCreate = args[i].equals("-create");
      final boolean isSimilarity = args[i].equals("-similarity");
      final boolean isScore = args[i].equals("-score");
      if (!isCreate && !isSimilarity && !isScore) {
        continue; // tokens that are not options are ignored
      }

      // Every option consumes the three arguments that follow it; bail out with the
      // usage text instead of running off the end of the array.
      if (i + 3 >= args.length) {
        System.err.println(usage);
        System.exit(-1);
      }

      if (isCreate) { // found -create option
        command = CREATE;
        profilename = args[++i];
        filename = args[++i];
        encoding = args[++i];
      } else if (isSimilarity) { // found -similarity option
        command = SIMILARITY;
        filename = args[++i];
        filename2 = args[++i];
        encoding = args[++i];
      } else { // found -score option
        command = SCORE;
        profilename = args[++i];
        filename = args[++i];
        encoding = args[++i];
      }
    }

    switch (command) {
      case CREATE:
        {
          NGramProfile created = profileFromFile(profilename, filename, encoding);
          FileOutputStream fos =
              new FileOutputStream(new File(profilename + "." + FILE_EXTENSION));
          try {
            created.save(fos);
          } finally {
            fos.close(); // make sure the saved profile reaches the disk and the handle is freed
          }
          System.out.println(
              "new profile " + profilename + "." + FILE_EXTENSION + " was created.");
          break;
        }

      case SIMILARITY:
        {
          NGramProfile first = profileFromFile(filename, filename, encoding);
          first.normalize();

          NGramProfile second = profileFromFile(filename2, filename2, encoding);
          second.normalize();
          System.out.println("Similarity is " + first.getSimilarity(second));
          break;
        }

      case SCORE:
        {
          NGramProfile sample = profileFromFile(filename, filename, encoding);

          NGramProfile compare =
              new NGramProfile(profilename, DEFAULT_MIN_NGRAM_LENGTH, DEFAULT_MAX_NGRAM_LENGTH);
          FileInputStream fis = new FileInputStream(new File(profilename + "." + FILE_EXTENSION));
          try {
            compare.load(fis);
          } finally {
            fis.close();
          }
          System.out.println("Score is " + compare.getSimilarity(sample));
          break;
        }

      default:
        // No recognized option on the command line.
        System.err.println(usage);
        System.exit(-1);
    }
  }

  /** Builds a profile called {@code name} from the file at {@code path}, always closing the file. */
  private static NGramProfile profileFromFile(String name, String path, String encoding)
      throws IOException {
    FileInputStream in = new FileInputStream(new File(path));
    try {
      return NGramProfile.create(name, in, encoding);
    } finally {
      in.close();
    }
  }