/**
 * Constructs an iterator over the named lexicon structure of the given index, obtaining the
 * index path, prefix, and the key/value factories from the index's registered structures
 * ({@code structureName + "-keyfactory"} and {@code structureName + "-valuefactory"}).
 *
 * @param structureName name of the lexicon structure within the index
 * @param index the index containing the lexicon structure
 * @throws IOException if the underlying structure cannot be opened
 */
@SuppressWarnings("unchecked")
public MapFileLexiconIterator(String structureName, Index index) throws IOException {
  this(
      structureName,
      index.getPath(),
      index.getPrefix(),
      (FixedSizeWriteableFactory<Text>) index.getIndexStructure(structureName + "-keyfactory"),
      (FixedSizeWriteableFactory<LexiconEntry>)
          index.getIndexStructure(structureName + "-valuefactory"));
}
/**
 * Constructs a new FSOMapFileLexicon for the named structure of the given index. The index path,
 * prefix and key/value factories are obtained from the index, along with three index properties
 * controlling the lexicon's behaviour: {@code index.<structureName>.termids} (default
 * {@code "aligned"}), {@code index.<structureName>.bsearchshortcut} (default {@code "default"})
 * and {@code index.<structureName>.data-source} (default {@code "file"}).
 *
 * @param structureName name of the lexicon structure within the index
 * @param index the index containing the lexicon structure
 * @throws IOException if the underlying structure cannot be opened
 */
@SuppressWarnings("unchecked")
public FSOMapFileLexicon(String structureName, Index index) throws IOException {
  this(
      structureName,
      index.getPath(),
      index.getPrefix(),
      (FixedSizeWriteableFactory<Text>) index.getIndexStructure(structureName + "-keyfactory"),
      (FixedSizeWriteableFactory<LexiconEntry>)
          index.getIndexStructure(structureName + "-valuefactory"),
      index.getIndexProperty("index." + structureName + ".termids", "aligned"),
      index.getIndexProperty("index." + structureName + ".bsearchshortcut", "default"),
      index.getIndexProperty("index." + structureName + ".data-source", "file"));
}
/**
 * Optimises a FSOMapFileLexicon: adds the termid lookup file (if required) and creates the
 * lexicon hash file. The number of entries is determined from the underlying map file before
 * delegating to the four-argument {@code optimise} overload.
 *
 * @param structureName name of the index structure that this FSOMapFileLexicon represents
 * @param index the index that the lexicon belongs to
 * @param statsCounter counter that receives each lexicon entry's statistics
 * @throws IOException if an IO problem occurs
 */
@SuppressWarnings("unchecked")
public static void optimise(
    String structureName, Index index, LexiconBuilder.CollectionStatisticsCounter statsCounter)
    throws IOException {
  final String dataFilename =
      constructFilename(structureName, index.getPath(), index.getPrefix(), MAPFILE_EXT);
  final FixedSizeWriteableFactory<Text> keys =
      (FixedSizeWriteableFactory<Text>) index.getIndexStructure(structureName + "-keyfactory");
  final FixedSizeWriteableFactory<LexiconEntry> values =
      (FixedSizeWriteableFactory<LexiconEntry>)
          index.getIndexStructure(structureName + "-valuefactory");
  final int entryCount = FSOrderedMapFile.numberOfEntries(dataFilename, keys, values);
  optimise(structureName, index, statsCounter, entryCount);
}
/** * optimise * * @param structureName * @param index * @param statsCounter * @param numEntries * @throws IOException */ @SuppressWarnings("unchecked") public static void optimise( String structureName, Index index, LexiconBuilder.CollectionStatisticsCounter statsCounter, int numEntries) throws IOException { final String mapFileFilename = constructFilename(structureName, index.getPath(), index.getPrefix(), MAPFILE_EXT); final FixedSizeWriteableFactory<Text> keyFactory = (FixedSizeWriteableFactory<Text>) index.getIndexStructure(structureName + "-keyfactory"); final FixedSizeWriteableFactory<LexiconEntry> valueFactory = (FixedSizeWriteableFactory<LexiconEntry>) index.getIndexStructure(structureName + "-valuefactory"); // logger.info("Optimsing lexicon with "+ numEntries + " entries"); // term id lookups boolean termIdsAligned = true; int[] termid2index = new int[numEntries]; Arrays.fill(termid2index, -1); int counter = 0; int lastTermId = -1; // bsearch reduction int previousFirstChar = -1; int firstChar = 0; final TIntObjectHashMap<int[]> map = new TIntObjectHashMap<int[]>(); Iterator<Map.Entry<Text, LexiconEntry>> iterator = new FSOrderedMapFile.EntryIterator<Text, LexiconEntry>( mapFileFilename, keyFactory, valueFactory); while (iterator.hasNext()) { Map.Entry<Text, LexiconEntry> lee = iterator.next(); // System.err.println(lee.toString()); // term id int termId = lee.getValue().getTermId(); if (!(termId == lastTermId + 1)) termIdsAligned = false; if (termid2index[termId] != -1) { throw new WrappedIOException( new IllegalArgumentException( "Termid " + termId + " is not unique - used at entries " + termid2index[termId] + " and" + counter)); } termid2index[termId] = counter; lastTermId = termId; // bsearch reduction optimisaion firstChar = lee.getKey().charAt(0); if (firstChar != previousFirstChar) { int[] boundaries = new int[] {counter, 0}; map.put(firstChar, boundaries); previousFirstChar = firstChar; } // increments statsCounter.count(lee.getValue()); 
counter++; } if (counter != numEntries) termIdsAligned = false; IndexUtil.close(iterator); // deal with termids if (termIdsAligned) { index.setIndexProperty("index." + structureName + ".termids", "aligned"); // logger.info("All ids for structure "+structureName+ " are aligned, skipping " // +ID_EXT+ " file"); } else { DataOutputStream dos = new DataOutputStream( Files.writeFileStream( constructFilename(structureName, index.getPath(), index.getPrefix(), ID_EXT))); for (int indexof : termid2index) dos.writeInt(indexof); dos.close(); index.setIndexProperty( "index." + structureName + ".termids", (numEntries > 15000000) ? "file" : "fileinmem"); } int[] mapKeys = map.keys(); Arrays.sort(mapKeys); final int mapKeysSize = mapKeys.length; for (int i = 0; i < mapKeysSize - 1; i++) { int nextLowerBoundary = (map.get(mapKeys[i + 1]))[0]; int[] currentBoundaries = map.get(mapKeys[i]); currentBoundaries[1] = nextLowerBoundary; map.put(mapKeys[i], currentBoundaries); } // do something about the last entry int nextLowerBoundary = counter; int[] currentBoundaries = (int[]) map.get(mapKeys[mapKeysSize - 1]); currentBoundaries[1] = nextLowerBoundary; map.put(mapKeys[mapKeysSize - 1], currentBoundaries); final ObjectOutputStream oos = new ObjectOutputStream( Files.writeFileStream( constructFilename(structureName, index.getPath(), index.getPrefix(), HASH_EXT))); oos.writeObject(map); oos.close(); index.setIndexProperty("index." + structureName + ".bsearchshortcut", "charmap"); index.flush(); }