/**
 * Computes a distance-style score between this profile and another one.
 *
 * <p>Both profiles are walked in turn. An n-gram present in both contributes half
 * of the absolute frequency difference on each pass; an n-gram missing from the
 * opposite profile contributes its full frequency. Identical profiles therefore
 * score 0, and larger values mean less similar profiles.
 *
 * <p>Any exception raised while comparing is logged at warn level and the sum
 * accumulated up to that point is returned.
 *
 * @param another ngram profile to compare against
 * @return similarity score, 0 = exact match
 */
public float getSimilarity(NGramProfile another) {
    float distance = 0;
    try {
        // Pass 1: entries of the other profile measured against this one.
        for (Iterator it = another.getSorted().iterator(); it.hasNext();) {
            NGramEntry theirs = (NGramEntry) it.next();
            if (!ngrams.containsKey(theirs.seq)) {
                distance += theirs.frequency;
                continue;
            }
            NGramEntry mine = (NGramEntry) ngrams.get(theirs.seq);
            distance += Math.abs((theirs.frequency - mine.frequency)) / 2;
        }

        // Pass 2: entries of this profile measured against the other one.
        for (Iterator it = getSorted().iterator(); it.hasNext();) {
            NGramEntry mine = (NGramEntry) it.next();
            if (!another.ngrams.containsKey(mine.seq)) {
                distance += mine.frequency;
                continue;
            }
            NGramEntry theirs = (NGramEntry) another.ngrams.get(mine.seq);
            distance += Math.abs((mine.frequency - theirs.frequency)) / 2;
        }
    } catch (final Exception e) {
        LOGGER.warn(e);
    }
    return distance;
}
/**
 * Creates a new language profile from a (preferably quite large) text stream.
 *
 * <p>The whole stream is decoded through a single reader, so a multi-byte
 * character that happens to straddle an internal read boundary is decoded
 * correctly instead of being split into two broken halves.
 *
 * <p>The stream is not closed here; that remains the caller's responsibility.
 * An {@link IOException} (including an unsupported encoding name) is logged at
 * warn level, and the profile is then built from whatever text was read before
 * the failure.
 *
 * @param name is the name of the profile
 * @param is is the stream to read
 * @param encoding is the encoding of the stream
 * @return the profile produced by analyzing the decoded text
 */
public static NGramProfile create(String name, InputStream is, String encoding) {
    NGramProfile newProfile = new NGramProfile(name, ABSOLUTE_MIN_NGRAM_LENGTH,
            ABSOLUTE_MAX_NGRAM_LENGTH);
    StringBuffer text = new StringBuffer();

    try {
        // Fully-qualified names keep this method independent of the file's import list.
        java.io.Reader reader = new java.io.InputStreamReader(is, encoding);
        char[] buffer = new char[4096];
        int len;
        while ((len = reader.read(buffer)) != -1) {
            text.append(buffer, 0, len);
        }
    } catch (final IOException e) {
        LOGGER.warn("Exception raised while creating profile.", e);
    }

    newProfile.analyze(text);
    return newProfile;
}
/** * main method used for testing only * * @param args */ public static void main(String args[]) throws Exception { String usage = "Usage: NGramProfile " + "[-create profilename filename encoding] " + "[-similarity file1 file2] " + "[-score profile-name filename encoding]"; int command = 0; final int CREATE = 1; final int SIMILARITY = 2; final int SCORE = 3; String profilename = ""; String filename = ""; String filename2 = ""; String encoding = ""; if (args.length == 0) { System.err.println(usage); System.exit(-1); } for (int i = 0; i < args.length; i++) { // parse command line if (args[i].equals("-create")) { // found -create option command = CREATE; profilename = args[++i]; filename = args[++i]; encoding = args[++i]; } if (args[i].equals("-similarity")) { // found -similarity option command = SIMILARITY; filename = args[++i]; filename2 = args[++i]; encoding = args[++i]; } if (args[i].equals("-score")) { // found -Score option command = SCORE; profilename = args[++i]; filename = args[++i]; encoding = args[++i]; } } switch (command) { case CREATE: File f = new File(filename); FileInputStream fis = new FileInputStream(f); NGramProfile newProfile = NGramProfile.create(profilename, fis, encoding); fis.close(); f = new File(profilename + "." + FILE_EXTENSION); FileOutputStream fos = new FileOutputStream(f); newProfile.save(fos); System.out.println("new profile " + profilename + "." + FILE_EXTENSION + " was created."); break; case SIMILARITY: f = new File(filename); fis = new FileInputStream(f); newProfile = NGramProfile.create(filename, fis, encoding); newProfile.normalize(); f = new File(filename2); fis = new FileInputStream(f); NGramProfile newProfile2 = NGramProfile.create(filename2, fis, encoding); newProfile2.normalize(); System.out.println("Similarity is " + newProfile.getSimilarity(newProfile2)); break; case SCORE: f = new File(filename); fis = new FileInputStream(f); newProfile = NGramProfile.create(filename, fis, encoding); f = new File(profilename + "." 
+ FILE_EXTENSION); fis = new FileInputStream(f); NGramProfile compare = new NGramProfile(profilename, DEFAULT_MIN_NGRAM_LENGTH, DEFAULT_MAX_NGRAM_LENGTH); compare.load(fis); System.out.println("Score is " + compare.getSimilarity(newProfile)); break; } }