/* * Guess the language of r and returns and array of Language Probability * sorted in decreasing order of probability * * If r is empty, returns all languages with probability of 0.0 * * Maximum maxGrams of r are processed */ public LanguageProbability[] guessLanguages(Reader r, int maxGrams) throws IOException { if (r == null) throw new IllegalArgumentException("Reader r must not be null"); if (maxGrams < 1) throw new IllegalArgumentException("maxGrams must be greater or equal to 1"); SortedSet ss = new TreeSet(); tg.setReader(r); test.clear(); tg.start(maxGrams); Set refset = trigramsmap.keySet(); Iterator it = refset.iterator(); while (it.hasNext()) { Trigrams reference = (Trigrams) it.next(); long distance = test.distance(reference); ss.add(new LanguageProbability((String) trigramsmap.get(reference), distance)); } // Transfer to array LanguageProbability[] lp = new LanguageProbability[ss.size()]; it = ss.iterator(); int i = 0; while (it.hasNext()) { lp[i++] = (LanguageProbability) it.next(); } float minprob = lp[0].probability; // In case of an empty reader, all languages are returned // with a probability of 0.0 if (minprob > 0.0f) { for (i = 0; i < lp.length; i++) { lp[i].probability = minprob / lp[i].probability; } } return lp; }
/**
 * Guesses the language of r and returns the name stored in trigramsmap
 * for the closest reference (documented as its ISO-639 code).
 *
 * The closest reference is the one with the smallest distance to the
 * trigrams read from r; on a tie the first one encountered wins.
 * If no reference has a distance below Long.MAX_VALUE (for instance when
 * no references are loaded), the map is queried with a null key.
 *
 * At most maxGrams of r are processed.
 *
 * @param r        source of the text to classify; must not be null
 * @param maxGrams upper bound handed to the trigram generator; must be >= 1
 * @return the map value of the closest reference
 * @throws IOException              declared for failures while reading r
 * @throws IllegalArgumentException if r is null or maxGrams is below 1
 */
public String guessLanguage(Reader r, int maxGrams) throws IOException {
    if (r == null) {
        throw new IllegalArgumentException("Reader r must not be null");
    }
    if (maxGrams < 1) {
        throw new IllegalArgumentException("maxGrams must be greater or equal to 1");
    }

    // Rebuild the test trigrams from the supplied reader.
    tg.setReader(r);
    test.clear();
    tg.start(maxGrams);

    // Linear scan for the reference with the smallest distance.
    Trigrams closest = null;
    long smallestDistance = Long.MAX_VALUE;
    for (Iterator candidates = trigramsmap.keySet().iterator(); candidates.hasNext();) {
        Trigrams candidate = (Trigrams) candidates.next();
        long candidateDistance = test.distance(candidate);
        if (candidateDistance < smallestDistance) {
            smallestDistance = candidateDistance;
            closest = candidate;
        }
    }

    return (String) trigramsmap.get(closest);
}
/**
 * Constructs a LanguageGuesser from a list of trigram resources.
 *
 * The language code of each resource is taken from the URL path: the text
 * after the last '/' and before the last '.'. If the file name has no
 * extension, the whole file name is used as the code.
 *
 * @param urls locations of the trigram resources to load
 * @throws IOException if a resource cannot be opened, read or closed
 */
public TrigramLanguageGuesser(URL[] urls) throws IOException {
    tg = new TrigramGenerator();
    tg.addTrigramListener(test);
    for (int i = 0; i < urls.length; i++) {
        URL url = urls[i];
        String path = url.getPath();
        int fileNameStart = path.lastIndexOf('/') + 1;
        int fileNameEnd = path.lastIndexOf('.');
        // No '.' in the file name itself (none at all, or only inside a
        // directory name): take everything after the last '/' instead of
        // failing in substring().
        if (fileNameEnd < fileNameStart) {
            fileNameEnd = path.length();
        }
        String code = path.substring(fileNameStart, fileNameEnd);

        // Always release the stream opened here; closing an already
        // closed stream has no effect, so this is safe even if
        // addFromStream closes it as well.
        java.io.InputStream in = url.openStream();
        try {
            addFromStream(code, in);
        } finally {
            in.close();
        }
    }
}
/**
 * Constructs a LanguageGuesser.
 *
 * fileLocation is a directory containing xx.tri files, where xx is the
 * ISO-639 language code
 * (see http://ftp.ics.uci.edu/pub/ietf/http/related/iso639.txt).
 * Every entry of the directory whose name ends with ".tri" is loaded.
 *
 * @param fileLocation path of the directory holding the .tri files
 * @throws IOException      declared for failures while loading a file
 * @throws RuntimeException if no trigrams are loaded, e.g. because
 *                          fileLocation is not a directory, cannot be
 *                          listed, or holds no .tri file
 */
public TrigramLanguageGuesser(String fileLocation) throws IOException {
    tg = new TrigramGenerator();
    tg.addTrigramListener(test);

    File f = new File(fileLocation);
    if (f.isDirectory()) {
        String[] files = f.list();
        // File.list() returns null when the directory cannot be read;
        // treat that like an empty directory so the check below reports it.
        if (files != null) {
            for (int i = 0; i < files.length; i++) {
                // Match the ".tri" extension, not any name ending in "tri".
                if (files[i].endsWith(".tri")) {
                    addFile(f.getAbsolutePath() + File.separator + files[i]);
                }
            }
        }
    }

    if (trigramsmap.size() == 0)
        throw new RuntimeException(
            "Location [ " + f.getAbsolutePath() + " ] doesn't contain any .tri file");
}