/**
 * Creates a language model over the supplied count databases and records
 * every context found in them.
 *
 * <p>The model order is taken from {@code countDbs.length}. Every database
 * except the last one is scanned: each non-null key whose
 * {@code CountType.HISTORY_TYPE_INDEX} count is positive is stored both in
 * {@code allContextsSet} (wrapped in a {@code LongArrWrapper}) and in
 * {@code allContexts} (converted with {@code LongNgram.convertToIntArr}).
 *
 * @param charIndexer      indexer for the character vocabulary; stored as-is
 * @param countDbs         one count database per slot, presumably one per
 *                         n-gram order (TODO confirm); its length becomes
 *                         {@code maxOrder}
 * @param activeCharacters set of character indices; must not be {@code null}
 * @param type             language model type; stored as-is
 * @param lmPower          stored as-is; how it is applied is not visible here
 * @throws RuntimeException if {@code activeCharacters} is {@code null}
 *                          (checked only after the contexts were collected)
 */
public NgramLanguageModel(
    Indexer<String> charIndexer,
    CountDbBig[] countDbs,
    Set<Integer> activeCharacters,
    LMType type,
    double lmPower) {
  this.charIndexer = charIndexer;
  this.countDbs = countDbs;
  this.maxOrder = countDbs.length;
  this.type = type;
  this.lmPower = lmPower;
  this.allContextsSet = new HashSet<LongArrWrapper>();
  this.allContexts = new ArrayList<int[]>();

  // The last database is deliberately left out of the scan.
  for (int order = 0; order < this.maxOrder - 1; order++) {
    CountDbBig db = countDbs[order];
    for (long[] ngram : db.getKeys()) {
      if (ngram == null) {
        continue; // key arrays may contain empty (null) slots
      }
      boolean seenAsHistory = db.getCount(ngram, CountType.HISTORY_TYPE_INDEX) > 0;
      if (seenAsHistory) {
        allContextsSet.add(new LongArrWrapper(ngram));
        allContexts.add(LongNgram.convertToIntArr(ngram));
      }
    }
  }

  if (activeCharacters == null) {
    throw new RuntimeException("activeCharacters is null!");
  }
  this.activeCharacters = activeCharacters;
}
/**
 * Debugging aid: sums {@code getCharNgramProb(context, i)} over every index
 * {@code i} in {@code [0, charIndexer.size())} and prints the total to
 * standard output together with a rendering of the context.
 *
 * @param context the context to evaluate
 */
public void checkNormalizes(int[] context) {
  double probabilityMass = 0;
  int charIndex = 0;
  while (charIndex < charIndexer.size()) {
    probabilityMass += getCharNgramProb(context, charIndex);
    charIndex++;
  }

  String report =
      "Total prob for context "
          + LongNgram.toString(context, charIndexer)
          + ": "
          + probabilityMass;
  System.out.println(report);
}
/**
 * Tells whether the given context is known to this model.
 *
 * <p>A zero-length context is always reported as contained. Any other
 * context is converted with {@code LongNgram.convertToLong}, wrapped in a
 * {@code LongArrWrapper}, and looked up in {@code allContextsSet}.
 *
 * @param context the context to look up
 * @return {@code true} if the context is empty or present in
 *         {@code allContextsSet}; {@code false} otherwise
 */
public boolean containsContext(int[] context) {
  // The empty context needs no lookup.
  if (context.length == 0) {
    return true;
  }

  LongArrWrapper lookupKey = new LongArrWrapper(LongNgram.convertToLong(context));
  return allContextsSet.contains(lookupKey);
}