/**
 * Reads the unigrams and bigrams of {@code file} through a {@code DictDecoder}, verifies the
 * decoded content with {@code checkWordMap}, and reports how long the read took.
 *
 * <p>An {@code IOException} or {@code UnsupportedFormatException} raised while decoding is
 * logged rather than propagated; {@code checkWordMap} still runs afterwards on whatever was
 * collected up to that point.
 *
 * @param file the binary dictionary file to decode
 * @param words the expected words, forwarded to {@code checkWordMap}
 * @param bigrams the expected bigrams, forwarded to {@code checkWordMap}
 * @param bufferType the buffer type handed to {@code BinaryDictIOUtils.getDictDecoder}
 * @param checkProbability forwarded to {@code checkWordMap}
 * @return the milliseconds spent inside {@code readUnigramsAndBigramsBinary}, or -1 when the
 *         read did not complete
 */
private long timeAndCheckReadUnigramsAndBigramsBinary(
        final File file,
        final List<String> words,
        final SparseArray<List<Integer>> bigrams,
        final int bufferType,
        final boolean checkProbability) {
    final TreeMap<Integer, String> decodedWords = new TreeMap<>();
    final TreeMap<Integer, Integer> decodedFrequencies = new TreeMap<>();
    final TreeMap<Integer, ArrayList<PendingAttribute>> decodedBigrams = new TreeMap<>();

    // Stays at -1 unless the read below runs to completion.
    long elapsedMillis = -1;
    try {
        final DictDecoder decoder =
                BinaryDictIOUtils.getDictDecoder(file, 0, file.length(), bufferType);
        final long startMillis = System.currentTimeMillis();
        decoder.readUnigramsAndBigramsBinary(
                decodedWords, decodedFrequencies, decodedBigrams);
        elapsedMillis = System.currentTimeMillis() - startMillis;
    } catch (IOException e) {
        Log.e(TAG, "IOException", e);
    } catch (UnsupportedFormatException e) {
        Log.e(TAG, "UnsupportedFormatException", e);
    }

    checkWordMap(
            words, bigrams, decodedWords, decodedFrequencies, decodedBigrams, checkProbability);
    return elapsedMillis;
}
/**
 * Reads the fields of one PtNode from {@code mDictBuffer} and discards them.
 *
 * <p>The fields are consumed in this order: option flags, parent address, the character
 * string, the terminal id (only when the terminal flag is set), and the children address.
 * Presumably each read advances the buffer, leaving it positioned after the node.
 *
 * @param formatOptions the format options passed to the parent- and children-address readers
 */
@Override
public void skipPtNode(final FormatOptions formatOptions) {
    final int flags = PtNodeReader.readPtNodeOptionFlags(mDictBuffer);
    final boolean hasMultipleChars = 0 != (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS);
    final boolean isTerminal = 0 != (flags & FormatSpec.FLAG_IS_TERMINAL);

    PtNodeReader.readParentAddress(mDictBuffer, formatOptions);
    BinaryDictIOUtils.skipString(mDictBuffer, hasMultipleChars);
    if (isTerminal) {
        PtNodeReader.readTerminalId(mDictBuffer);
    }
    PtNodeReader.readChildrenAddress(mDictBuffer, flags, formatOptions);
}
private void checkWordMap( final List<String> expectedWords, final SparseArray<List<Integer>> expectedBigrams, final TreeMap<Integer, String> resultWords, final TreeMap<Integer, Integer> resultFrequencies, final TreeMap<Integer, ArrayList<PendingAttribute>> resultBigrams, final boolean checkProbability) { // check unigrams final Set<String> actualWordsSet = new HashSet<>(resultWords.values()); final Set<String> expectedWordsSet = new HashSet<>(expectedWords); assertEquals(actualWordsSet, expectedWordsSet); if (checkProbability) { for (int freq : resultFrequencies.values()) { assertEquals(freq, UNIGRAM_FREQ); } } // check bigrams final HashMap<String, Set<String>> expBigrams = new HashMap<>(); for (int i = 0; i < expectedBigrams.size(); ++i) { final String word1 = expectedWords.get(expectedBigrams.keyAt(i)); for (int w2 : expectedBigrams.valueAt(i)) { if (expBigrams.get(word1) == null) { expBigrams.put(word1, new HashSet<String>()); } expBigrams.get(word1).add(expectedWords.get(w2)); } } final HashMap<String, Set<String>> actBigrams = new HashMap<>(); for (Entry<Integer, ArrayList<PendingAttribute>> entry : resultBigrams.entrySet()) { final String word1 = resultWords.get(entry.getKey()); final int unigramFreq = resultFrequencies.get(entry.getKey()); for (PendingAttribute attr : entry.getValue()) { final String word2 = resultWords.get(attr.mAddress); if (actBigrams.get(word1) == null) { actBigrams.put(word1, new HashSet<String>()); } actBigrams.get(word1).add(word2); if (checkProbability) { final int bigramFreq = BinaryDictIOUtils.reconstructBigramFrequency(unigramFreq, attr.mFrequency); assertTrue(Math.abs(bigramFreq - BIGRAM_FREQ) < TOLERANCE_OF_BIGRAM_FREQ); } } } assertEquals(actBigrams, expBigrams); }
/**
 * Reads the whole dictionary stored in {@code file}, verifies it with {@code checkDictionary},
 * and reports how long the read took.
 *
 * <p>An {@code IOException} or {@code UnsupportedFormatException} raised while reading is
 * logged rather than propagated; in that case {@code checkDictionary} receives {@code null}
 * as the dictionary.
 *
 * @param file the binary dictionary file to read
 * @param words the expected words, forwarded to {@code checkDictionary}
 * @param bigrams the expected bigrams, forwarded to {@code checkDictionary}
 * @param shortcutMap the expected shortcuts, forwarded to {@code checkDictionary}
 * @param bufferType the buffer type handed to {@code BinaryDictIOUtils.getDictDecoder}
 * @return the milliseconds spent inside {@code readDictionaryBinary}, or -1 when the read did
 *         not complete
 */
private long timeReadingAndCheckDict(
        final File file,
        final List<String> words,
        final SparseArray<List<Integer>> bigrams,
        final HashMap<String, List<String>> shortcutMap,
        final int bufferType) {
    FusionDictionary loadedDict = null;
    // Stays at -1 unless the read below runs to completion.
    long elapsedMillis = -1;
    try {
        final DictDecoder decoder =
                BinaryDictIOUtils.getDictDecoder(file, 0, file.length(), bufferType);
        final long startMillis = System.currentTimeMillis();
        loadedDict = decoder.readDictionaryBinary(false /* deleteDictIfBroken */);
        elapsedMillis = System.currentTimeMillis() - startMillis;
    } catch (IOException e) {
        Log.e(TAG, "IOException while reading dictionary", e);
    } catch (UnsupportedFormatException e) {
        Log.e(TAG, "Unsupported format", e);
    }

    checkDictionary(loadedDict, words, bigrams, shortcutMap);
    return elapsedMillis;
}
@Override public PtNodeInfo readPtNode(int ptNodePos, FormatOptions options) { int addressPointer = ptNodePos; final int flags = PtNodeReader.readPtNodeOptionFlags(mDictBuffer); addressPointer += FormatSpec.PTNODE_FLAGS_SIZE; final int parentAddress = PtNodeReader.readParentAddress(mDictBuffer, options); if (BinaryDictIOUtils.supportsDynamicUpdate(options)) { addressPointer += FormatSpec.PARENT_ADDRESS_SIZE; } final int characters[]; if (0 != (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS)) { int index = 0; int character = CharEncoding.readChar(mDictBuffer); addressPointer += CharEncoding.getCharSize(character); while (FormatSpec.INVALID_CHARACTER != character && index < FormatSpec.MAX_WORD_LENGTH) { mCharacterBuffer[index++] = character; character = CharEncoding.readChar(mDictBuffer); addressPointer += CharEncoding.getCharSize(character); } characters = Arrays.copyOfRange(mCharacterBuffer, 0, index); } else { final int character = CharEncoding.readChar(mDictBuffer); addressPointer += CharEncoding.getCharSize(character); characters = new int[] {character}; } final int terminalId; if (0 != (FormatSpec.FLAG_IS_TERMINAL & flags)) { terminalId = PtNodeReader.readTerminalId(mDictBuffer); addressPointer += FormatSpec.PTNODE_TERMINAL_ID_SIZE; } else { terminalId = PtNode.NOT_A_TERMINAL; } final int frequency; if (0 != (FormatSpec.FLAG_IS_TERMINAL & flags)) { frequency = PtNodeReader.readFrequency(mFrequencyBuffer, terminalId); } else { frequency = PtNode.NOT_A_TERMINAL; } int childrenAddress = PtNodeReader.readChildrenAddress(mDictBuffer, flags, options); if (childrenAddress != FormatSpec.NO_CHILDREN_ADDRESS) { childrenAddress += addressPointer; } addressPointer += BinaryDictIOUtils.getChildrenAddressSize(flags, options); final ArrayList<WeightedString> shortcutTargets = readShortcuts(terminalId); final ArrayList<PendingAttribute> bigrams; if (0 != (flags & FormatSpec.FLAG_HAS_BIGRAMS)) { bigrams = new ArrayList<PendingAttribute>(); final int posOfBigrams = 
mBigramAddressTable.get(0 /* contentTableIndex */, terminalId); mBigramBuffer.position(posOfBigrams); while (bigrams.size() < FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) { // If bigrams.size() reaches FormatSpec.MAX_BIGRAMS_IN_A_PTNODE, // remaining bigram entries are ignored. final int bigramFlags = mBigramBuffer.readUnsignedByte(); final int targetTerminalId = mBigramBuffer.readUnsignedInt24(); mTerminalAddressTableBuffer.position( targetTerminalId * FormatSpec.TERMINAL_ADDRESS_TABLE_ADDRESS_SIZE); final int targetAddress = mTerminalAddressTableBuffer.readUnsignedInt24(); bigrams.add( new PendingAttribute( bigramFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY, targetAddress)); if (0 == (bigramFlags & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT)) break; } if (bigrams.size() >= FormatSpec.MAX_BIGRAMS_IN_A_PTNODE) { throw new RuntimeException( "Too many bigrams in a PtNode (" + bigrams.size() + " but max is " + FormatSpec.MAX_BIGRAMS_IN_A_PTNODE + ")"); } } else { bigrams = null; } return new PtNodeInfo( ptNodePos, addressPointer, flags, characters, frequency, parentAddress, childrenAddress, shortcutTargets, bigrams); }
private void runGetTerminalPosition( final ArrayList<String> words, final SparseArray<List<Integer>> bigrams, final int bufferType, final FormatOptions formatOptions, final String message) { final String dictName = "testGetTerminalPosition"; final String dictVersion = Long.toString(System.currentTimeMillis()); final File file = BinaryDictUtils.getDictFile( dictName, dictVersion, formatOptions, getContext().getCacheDir()); final FusionDictionary dict = new FusionDictionary( new PtNodeArray(), BinaryDictUtils.makeDictionaryOptions(dictName, dictVersion, formatOptions)); addUnigrams(sWords.size(), dict, sWords, null /* shortcutMap */); addBigrams(dict, words, bigrams); timeWritingDictToFile(file, dict, formatOptions); final DictDecoder dictDecoder = BinaryDictIOUtils.getDictDecoder(file, 0, file.length(), DictDecoder.USE_BYTEARRAY); try { dictDecoder.openDictBuffer(); } catch (IOException e) { Log.e(TAG, "IOException while opening the buffer", e); } catch (UnsupportedFormatException e) { Log.e(TAG, "IOException while opening the buffer", e); } assertTrue("Can't get the buffer", dictDecoder.isDictBufferOpen()); try { // too long word final String longWord = "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz"; assertEquals(FormatSpec.NOT_VALID_WORD, dictDecoder.getTerminalPosition(longWord)); // null assertEquals(FormatSpec.NOT_VALID_WORD, dictDecoder.getTerminalPosition(null)); // empty string assertEquals(FormatSpec.NOT_VALID_WORD, dictDecoder.getTerminalPosition("")); } catch (IOException e) { } catch (UnsupportedFormatException e) { } // Test a word that is contained within the dictionary. long sum = 0; for (int i = 0; i < sWords.size(); ++i) { final long time = checkGetTerminalPosition(dictDecoder, sWords.get(i), true); sum += time == -1 ? 0 : time; } Log.d( TAG, "per search : " + (((double) sum) / sWords.size() / 1000000) + " : " + message + " : " + outputOptions(bufferType, formatOptions)); // Test a word that isn't contained within the dictionary. 
final Random random = new Random((int) System.currentTimeMillis()); final int[] codePointSet = CodePointUtils.generateCodePointSet(DEFAULT_CODE_POINT_SET_SIZE, random); for (int i = 0; i < 1000; ++i) { final String word = CodePointUtils.generateWord(random, codePointSet); if (sWords.indexOf(word) != -1) continue; checkGetTerminalPosition(dictDecoder, word, false); } }