// Tests for readDictionaryBinary and writeDictionaryBinary private String runReadAndWrite( final List<String> words, final SparseArray<List<Integer>> bigrams, final HashMap<String, List<String>> shortcuts, final int bufferType, final FormatSpec.FormatOptions formatOptions, final String message) { final String dictName = "runReadAndWrite"; final String dictVersion = Long.toString(System.currentTimeMillis()); final File file = BinaryDictUtils.getDictFile( dictName, dictVersion, formatOptions, getContext().getCacheDir()); final FusionDictionary dict = new FusionDictionary( new PtNodeArray(), BinaryDictUtils.makeDictionaryOptions(dictName, dictVersion, formatOptions)); addUnigrams(words.size(), dict, words, shortcuts); addBigrams(dict, words, bigrams); checkDictionary(dict, words, bigrams, shortcuts); final long write = timeWritingDictToFile(file, dict, formatOptions); final long read = timeReadingAndCheckDict(file, words, bigrams, shortcuts, bufferType); return "PROF: read=" + read + "ms, write=" + write + "ms :" + message + " : " + outputOptions(bufferType, formatOptions); }
/**
 * Verifies that {@code BinaryDictionary.getWordProperty} reports the expected word, unigram
 * probability and shortcut targets for every word of a version 2 dictionary.
 *
 * <p>The dictionary is built from {@code sWords} and {@code sShortcuts}, with
 * {@code sEmptyBigrams} as bigram data, written to a file in the cache directory and then
 * loaded through {@code BinaryDictionary}.
 */
public void testVer2DictGetWordProperty() {
    final FormatOptions formatOptions = BinaryDictUtils.VERSION2_OPTIONS;
    final ArrayList<String> words = sWords;
    final HashMap<String, List<String>> shortcuts = sShortcuts;
    final String dictName = "testGetWordProperty";
    final String dictVersion = Long.toString(System.currentTimeMillis());

    final FusionDictionary dict = new FusionDictionary(
            new PtNodeArray(),
            BinaryDictUtils.makeDictionaryOptions(dictName, dictVersion, formatOptions));
    addUnigrams(words.size(), dict, words, shortcuts);
    addBigrams(dict, words, sEmptyBigrams);

    final File file = BinaryDictUtils.getDictFile(
            dictName, dictVersion, formatOptions, getContext().getCacheDir());
    // Remove any stale file with the same name before writing.
    file.delete();
    timeWritingDictToFile(file, dict, formatOptions);

    final BinaryDictionary binaryDictionary = new BinaryDictionary(
            file.getAbsolutePath(),
            0 /* offset */,
            file.length(),
            true /* useFullEditDistance */,
            Locale.ENGLISH,
            dictName,
            false /* isUpdatable */);

    for (final String word : words) {
        final WordProperty wordProperty =
                binaryDictionary.getWordProperty(word, false /* isBeginningOfSentence */);
        assertEquals(word, wordProperty.mWord);
        assertEquals(UNIGRAM_FREQ, wordProperty.getProbability());
        if (!shortcuts.containsKey(word)) {
            continue;
        }
        // Work on a copy: the list stored in the map belongs to the shared sShortcuts
        // fixture, and removing entries from it would corrupt the data seen by other tests.
        final List<String> remainingShortcuts = new ArrayList<>(shortcuts.get(word));
        assertEquals(remainingShortcuts.size(), wordProperty.mShortcutTargets.size());
        assertTrue(wordProperty.mHasShortcuts);
        for (final WeightedString shortcutTarget : wordProperty.mShortcutTargets) {
            assertTrue(remainingShortcuts.contains(shortcutTarget.mWord));
            assertEquals(UNIGRAM_FREQ, shortcutTarget.getProbability());
            // Tick off each expected shortcut as it is found.
            remainingShortcuts.remove(shortcutTarget.mWord);
        }
        // Every expected shortcut must have been reported by the dictionary.
        assertTrue(remainingShortcuts.isEmpty());
    }
}
private String runReadUnigramsAndBigramsBinary( final ArrayList<String> words, final SparseArray<List<Integer>> bigrams, final int bufferType, final FormatSpec.FormatOptions formatOptions, final String message) { final String dictName = "runReadUnigrams"; final String dictVersion = Long.toString(System.currentTimeMillis()); final File file = BinaryDictUtils.getDictFile( dictName, dictVersion, formatOptions, getContext().getCacheDir()); // making the dictionary from lists of words. final FusionDictionary dict = new FusionDictionary( new PtNodeArray(), BinaryDictUtils.makeDictionaryOptions(dictName, dictVersion, formatOptions)); addUnigrams(words.size(), dict, words, null /* shortcutMap */); addBigrams(dict, words, bigrams); timeWritingDictToFile(file, dict, formatOptions); // Caveat: Currently, the Java code to read a v4 dictionary doesn't calculate the // probability when there's a timestamp for the entry. // TODO: Abandon the Java code, and implement the v4 dictionary reading code in native. long wordMap = timeAndCheckReadUnigramsAndBigramsBinary( file, words, bigrams, bufferType, !formatOptions.mHasTimestamp /* checkProbability */); long fullReading = timeReadingAndCheckDict(file, words, bigrams, null /* shortcutMap */, bufferType); return "readDictionaryBinary=" + fullReading + ", readUnigramsAndBigramsBinary=" + wordMap + " : " + message + " : " + outputOptions(bufferType, formatOptions); }
private long timeWritingDictToFile( final File file, final FusionDictionary dict, final FormatSpec.FormatOptions formatOptions) { long now = -1, diff = -1; try { final DictEncoder dictEncoder = BinaryDictUtils.getDictEncoder(file, formatOptions); now = System.currentTimeMillis(); // If you need to dump the dict to a textual file, uncomment the line below and the // function above // dumpToCombinedFileForDebug(file, "/tmp/foo"); dictEncoder.writeDictionary(dict, formatOptions); diff = System.currentTimeMillis() - now; } catch (IOException e) { Log.e(TAG, "IO exception while writing file", e); } catch (UnsupportedFormatException e) { Log.e(TAG, "UnsupportedFormatException", e); } return diff; }
/**
 * Verifies that iterating a version 2 dictionary with
 * {@code BinaryDictionary.getNextWordProperty} visits every word exactly as built, with the
 * expected unigram probability, shortcut targets and bigrams.
 *
 * <p>The dictionary is built from {@code sWords}, {@code sShortcuts} and
 * {@code sEmptyBigrams}, written to a file in the cache directory and then loaded through
 * {@code BinaryDictionary}. Iteration starts at token 0 and stops when the returned next
 * token is 0 again.
 */
public void testVer2DictIteration() {
    final FormatOptions formatOptions = BinaryDictUtils.VERSION2_OPTIONS;
    final ArrayList<String> words = sWords;
    final HashMap<String, List<String>> shortcuts = sShortcuts;
    final SparseArray<List<Integer>> bigrams = sEmptyBigrams;
    final String dictName = "testGetWordProperty";
    final String dictVersion = Long.toString(System.currentTimeMillis());

    final FusionDictionary dict = new FusionDictionary(
            new PtNodeArray(),
            BinaryDictUtils.makeDictionaryOptions(dictName, dictVersion, formatOptions));
    addUnigrams(words.size(), dict, words, shortcuts);
    addBigrams(dict, words, bigrams);

    final File file = BinaryDictUtils.getDictFile(
            dictName, dictVersion, formatOptions, getContext().getCacheDir());
    timeWritingDictToFile(file, dict, formatOptions);
    Log.d(TAG, file.getAbsolutePath());

    final BinaryDictionary binaryDictionary = new BinaryDictionary(
            file.getAbsolutePath(),
            0 /* offset */,
            file.length(),
            true /* useFullEditDistance */,
            Locale.ENGLISH,
            dictName,
            false /* isUpdatable */);

    // Expected content: every word, and every (word0, word1) pair described by bigrams,
    // where bigrams maps the index of word0 to the indices of its word1 entries.
    final HashSet<String> wordSet = new HashSet<>(words);
    final HashSet<Pair<String, String>> bigramSet = new HashSet<>();
    for (int i = 0; i < words.size(); i++) {
        final List<Integer> bigramList = bigrams.get(i);
        if (bigramList != null) {
            for (final Integer word1Index : bigramList) {
                final String word1 = words.get(word1Index);
                bigramSet.add(new Pair<>(words.get(i), word1));
            }
        }
    }

    int token = 0;
    do {
        final BinaryDictionary.GetNextWordPropertyResult result =
                binaryDictionary.getNextWordProperty(token);
        final WordProperty wordProperty = result.mWordProperty;
        final String word0 = wordProperty.mWord;
        assertEquals(UNIGRAM_FREQ, wordProperty.mProbabilityInfo.mProbability);
        wordSet.remove(word0);

        if (shortcuts.containsKey(word0)) {
            // Check for null before the list is dereferenced below.
            assertNotNull(wordProperty.mShortcutTargets);
            // Work on a copy: the list stored in the map belongs to the shared sShortcuts
            // fixture, and removing entries from it would corrupt the data seen by other
            // tests.
            final List<String> remainingShortcuts = new ArrayList<>(shortcuts.get(word0));
            assertEquals(remainingShortcuts.size(), wordProperty.mShortcutTargets.size());
            for (final WeightedString shortcutTarget : wordProperty.mShortcutTargets) {
                assertTrue(remainingShortcuts.contains(shortcutTarget.mWord));
                assertEquals(UNIGRAM_FREQ, shortcutTarget.getProbability());
                remainingShortcuts.remove(shortcutTarget.mWord);
            }
            // Every expected shortcut must have been reported by the dictionary.
            assertTrue(remainingShortcuts.isEmpty());
        }

        for (int j = 0; j < wordProperty.mBigrams.size(); j++) {
            final String word1 = wordProperty.mBigrams.get(j).mWord;
            // remove() returns true only if the pair was expected and not yet seen.
            assertTrue(bigramSet.remove(new Pair<>(word0, word1)));
        }
        token = result.mNextToken;
    } while (token != 0);

    // Everything expected must have been visited during the iteration.
    assertTrue(wordSet.isEmpty());
    assertTrue(bigramSet.isEmpty());
}
private void runGetTerminalPosition( final ArrayList<String> words, final SparseArray<List<Integer>> bigrams, final int bufferType, final FormatOptions formatOptions, final String message) { final String dictName = "testGetTerminalPosition"; final String dictVersion = Long.toString(System.currentTimeMillis()); final File file = BinaryDictUtils.getDictFile( dictName, dictVersion, formatOptions, getContext().getCacheDir()); final FusionDictionary dict = new FusionDictionary( new PtNodeArray(), BinaryDictUtils.makeDictionaryOptions(dictName, dictVersion, formatOptions)); addUnigrams(sWords.size(), dict, sWords, null /* shortcutMap */); addBigrams(dict, words, bigrams); timeWritingDictToFile(file, dict, formatOptions); final DictDecoder dictDecoder = BinaryDictIOUtils.getDictDecoder(file, 0, file.length(), DictDecoder.USE_BYTEARRAY); try { dictDecoder.openDictBuffer(); } catch (IOException e) { Log.e(TAG, "IOException while opening the buffer", e); } catch (UnsupportedFormatException e) { Log.e(TAG, "IOException while opening the buffer", e); } assertTrue("Can't get the buffer", dictDecoder.isDictBufferOpen()); try { // too long word final String longWord = "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz"; assertEquals(FormatSpec.NOT_VALID_WORD, dictDecoder.getTerminalPosition(longWord)); // null assertEquals(FormatSpec.NOT_VALID_WORD, dictDecoder.getTerminalPosition(null)); // empty string assertEquals(FormatSpec.NOT_VALID_WORD, dictDecoder.getTerminalPosition("")); } catch (IOException e) { } catch (UnsupportedFormatException e) { } // Test a word that is contained within the dictionary. long sum = 0; for (int i = 0; i < sWords.size(); ++i) { final long time = checkGetTerminalPosition(dictDecoder, sWords.get(i), true); sum += time == -1 ? 0 : time; } Log.d( TAG, "per search : " + (((double) sum) / sWords.size() / 1000000) + " : " + message + " : " + outputOptions(bufferType, formatOptions)); // Test a word that isn't contained within the dictionary. 
final Random random = new Random((int) System.currentTimeMillis()); final int[] codePointSet = CodePointUtils.generateCodePointSet(DEFAULT_CODE_POINT_SET_SIZE, random); for (int i = 0; i < 1000; ++i) { final String word = CodePointUtils.generateWord(random, codePointSet); if (sWords.indexOf(word) != -1) continue; checkGetTerminalPosition(dictDecoder, word, false); } }