private void add(int[] path, int index, D value) {
  if (index == path.length)
    this.values.add(value);
  else {
    PackingTrie<D> child = children.get(path[index]);
    if (child == null) {
      child = new PackingTrie<D>(this, path[index]);
      children.put(path[index], child);
    }
    child.add(path, index + 1, value);
  }
}
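/*
 * A minimal usage sketch, not part of the original class: it shows how rule paths feed the
 * recursive add() above. It assumes the two-argument add() wrapper that starts the recursion at
 * index 0 (the form binarize() calls below); the words and feature block indices are purely
 * illustrative.
 */
private void addUsageSketch() {
  PackingTrie<SourceValue> root = new PackingTrie<SourceValue>();
  int[] path = { Vocabulary.id("el"), Vocabulary.id("gato") };
  // First insertion builds the chain root -> "el" -> "gato" and stores the value at the leaf.
  root.add(path, new SourceValue(Vocabulary.id("[X]"), 0));
  // A second insertion with the same path walks the existing nodes; only the leaf's value list
  // grows, which is how rules sharing a source prefix stay compact.
  root.add(path, new SourceValue(Vocabulary.id("[X]"), 1));
}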
/**
 * Serializes the source, target and feature data structures into interlinked binary files. Target
 * is written first, into a skeletal (nodes don't carry any data) upward-pointing trie, updating
 * the linking source trie nodes with the position once it is known. Source and feature data are
 * written simultaneously. The source structure is written into a downward-pointing trie and
 * stores the rule's lhs as well as links to the target and feature stream. The feature stream is
 * written in blocks; each rule's block index is stored in the source trie alongside those links.
 *
 * @param source_trie the downward-pointing source-side trie
 * @param target_trie the upward-pointing target-side trie
 * @param feature_buffer buffered feature data for this slice
 * @param alignment_buffer buffered alignment data for this slice (may be null)
 * @param id running number of this slice, used to name its files
 * @return a PackingFileTuple naming the files this slice was written to
 * @throws IOException if writing to any of the slice's files fails
 */
private PackingFileTuple flush(PackingTrie<SourceValue> source_trie,
    PackingTrie<TargetValue> target_trie, FeatureBuffer feature_buffer,
    AlignmentBuffer alignment_buffer, int id) throws IOException {
  // Make a slice object for this piece of the grammar.
  PackingFileTuple slice = new PackingFileTuple("slice_" + String.format("%05d", id));
  // Pull out the streams for source, target and data output.
  DataOutputStream source_stream = slice.getSourceOutput();
  DataOutputStream target_stream = slice.getTargetOutput();
  DataOutputStream target_lookup_stream = slice.getTargetLookupOutput();
  DataOutputStream feature_stream = slice.getFeatureOutput();
  DataOutputStream alignment_stream = slice.getAlignmentOutput();

  Queue<PackingTrie<TargetValue>> target_queue;
  Queue<PackingTrie<SourceValue>> source_queue;

  // The number of bytes both written into the source stream and
  // buffered in the source queue.
  int source_position;
  // The number of bytes written into the target stream.
  int target_position;

  // Add trie root into queue, set target position to 0 and set cumulated
  // size to size of trie root.
  target_queue = new LinkedList<PackingTrie<TargetValue>>();
  target_queue.add(target_trie);
  target_position = 0;

  // Target lookup table for trie levels.
  int current_level_size = 1;
  int next_level_size = 0;
  ArrayList<Integer> target_lookup = new ArrayList<Integer>();

  // Packing loop for upwards-pointing target trie.
  while (!target_queue.isEmpty()) {
    // Pop top of queue.
    PackingTrie<TargetValue> node = target_queue.poll();
    // Register that this is where we're writing the node to.
    node.address = target_position;
    // Tell source nodes that we're writing to this position in the file.
    for (TargetValue tv : node.values)
      tv.parent.target = node.address;
    // Write link to parent.
    if (node.parent != null)
      target_stream.writeInt(node.parent.address);
    else
      target_stream.writeInt(-1);
    target_stream.writeInt(node.symbol);
    // Enqueue children.
    for (int k : node.children.descendingKeySet()) {
      PackingTrie<TargetValue> child = node.children.get(k);
      target_queue.add(child);
    }
    target_position += node.size(false, true);
    next_level_size += node.children.descendingKeySet().size();
    current_level_size--;
    if (current_level_size == 0) {
      target_lookup.add(target_position);
      current_level_size = next_level_size;
      next_level_size = 0;
    }
  }
  target_lookup_stream.writeInt(target_lookup.size());
  for (int i : target_lookup)
    target_lookup_stream.writeInt(i);
  target_lookup_stream.close();

  // Setting up for source and data writing.
  source_queue = new LinkedList<PackingTrie<SourceValue>>();
  source_queue.add(source_trie);
  source_position = source_trie.size(true, false);
  source_trie.address = target_position;

  // Ready data buffers for writing.
  feature_buffer.initialize();
  if (packAlignments)
    alignment_buffer.initialize();

  // Packing loop for downwards-pointing source trie.
  while (!source_queue.isEmpty()) {
    // Pop top of queue.
    PackingTrie<SourceValue> node = source_queue.poll();
    // Write number of children.
    source_stream.writeInt(node.children.size());
    // Write links to children.
    for (int k : node.children.descendingKeySet()) {
      PackingTrie<SourceValue> child = node.children.get(k);
      // Enqueue child.
      source_queue.add(child);
      // Child's address will be at the current end of the queue.
      child.address = source_position;
      // Advance cumulated size by child's size.
      source_position += child.size(true, false);
      // Write the link.
      source_stream.writeInt(k);
      source_stream.writeInt(child.address);
    }
    // Write number of data items.
    source_stream.writeInt(node.values.size());
    // Write lhs and links to target and data.
    for (SourceValue sv : node.values) {
      int feature_block_index = feature_buffer.write(sv.data);
      if (packAlignments) {
        int alignment_block_index = alignment_buffer.write(sv.data);
        if (alignment_block_index != feature_block_index) {
          logger.severe("Block index mismatch.");
          throw new RuntimeException("Block index mismatch: alignment (" + alignment_block_index
              + ") and features (" + feature_block_index + ") don't match.");
        }
      }
      source_stream.writeInt(sv.lhs);
      source_stream.writeInt(sv.target);
      source_stream.writeInt(feature_block_index);
    }
  }
  // Flush the data streams.
  feature_buffer.flush(feature_stream);
  if (packAlignments)
    alignment_buffer.flush(alignment_stream);

  target_stream.close();
  source_stream.close();
  feature_stream.close();
  if (packAlignments)
    alignment_stream.close();
  return slice;
}
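/*
 * A reader-side sketch, not part of the original class: it decodes one node record exactly as
 * the source packing loop above lays it out -- child count, (symbol, child address) pairs,
 * value count, then (lhs, target address, feature block index) triples. It assumes a
 * java.io.DataInputStream over the source file; the method and variable names are hypothetical.
 */
private void readSourceNodeSketch(DataInputStream in) throws IOException {
  int num_children = in.readInt();
  for (int i = 0; i < num_children; i++) {
    int symbol = in.readInt(); // key under which the child was stored
    int child_address = in.readInt(); // byte offset of the child's record
  }
  int num_values = in.readInt();
  for (int i = 0; i < num_values; i++) {
    int lhs = in.readInt(); // the rule's left-hand side symbol id
    int target_address = in.readInt(); // address of the rule's target trie leaf
    int feature_block_index = in.readInt(); // block index into the feature stream
  }
}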
private void binarize(LineReader grammar_reader, LineReader alignment_reader,
    Queue<PackingFileTuple> slices) throws IOException {
  int counter = 0;
  int slice_counter = 0;
  int num_slices = 0;

  boolean ready_to_flush = false;
  String first_source_word = null;

  PackingTrie<SourceValue> source_trie = new PackingTrie<SourceValue>();
  PackingTrie<TargetValue> target_trie = new PackingTrie<TargetValue>();
  FeatureBuffer feature_buffer = new FeatureBuffer();

  AlignmentBuffer alignment_buffer = null;
  if (packAlignments)
    alignment_buffer = new AlignmentBuffer();

  TreeMap<Integer, Float> features = new TreeMap<Integer, Float>();
  while (grammar_reader.hasNext()) {
    String grammar_line = grammar_reader.next().trim();
    counter++;
    slice_counter++;

    String[] fields = grammar_line.split("\\s\\|{3}\\s");
    if (fields.length < 4) {
      logger.warning("Incomplete grammar line at line " + counter);
      continue;
    }
    String lhs_word = fields[0];
    String[] source_words = fields[1].split("\\s");
    String[] target_words = fields[2].split("\\s");
    String[] feature_entries = fields[3].split("\\s");

    // Reached slice limit size, indicate that we're closing up.
    if (!ready_to_flush && (slice_counter > SLICE_SIZE || feature_buffer.overflowing()
        || (packAlignments && alignment_buffer.overflowing()))) {
      ready_to_flush = true;
      first_source_word = source_words[0];
    }
    // Finished closing up.
    if (ready_to_flush && !first_source_word.equals(source_words[0])) {
      slices.add(flush(source_trie, target_trie, feature_buffer, alignment_buffer, num_slices));
      source_trie.clear();
      target_trie.clear();
      feature_buffer.clear();
      if (packAlignments)
        alignment_buffer.clear();

      num_slices++;
      slice_counter = 0;
      ready_to_flush = false;
    }

    int alignment_index = -1;
    // If present, process alignments.
    if (packAlignments) {
      if (!alignment_reader.hasNext()) {
        logger.severe("No more alignments starting in line " + counter);
        throw new RuntimeException("No more alignments starting in line " + counter);
      } else {
        String alignment_line = alignment_reader.next().trim();
        String[] alignment_entries = alignment_line.split("\\s");
        byte[] alignments = new byte[alignment_entries.length * 2];
        if (alignment_entries.length != 0) {
          for (int i = 0; i < alignment_entries.length; i++) {
            String[] parts = alignment_entries[i].split("-");
            alignments[2 * i] = Byte.parseByte(parts[0]);
            alignments[2 * i + 1] = Byte.parseByte(parts[1]);
          }
        }
        alignment_index = alignment_buffer.add(alignments);
      }
    }

    // Process features.
    // Implicitly sort via TreeMap, write to data buffer, remember position
    // to pass on to the source trie node.
    features.clear();
    for (int f = 0; f < feature_entries.length; ++f) {
      String feature_entry = feature_entries[f];
      if (this.labeled) {
        String[] parts = feature_entry.split("=");
        if (parts[0].equals("Alignment"))
          continue;
        int feature_id = Vocabulary.id(parts[0]);
        float feature_value = Float.parseFloat(parts[1]);
        if (feature_value != 0)
          features.put(encoderConfig.innerId(feature_id), feature_value);
      } else {
        float feature_value = Float.parseFloat(feature_entry);
        if (feature_value != 0)
          features.put(f, feature_value);
      }
    }
    int features_index = feature_buffer.add(features);

    // Sanity check on the data block index.
    if (packAlignments && features_index != alignment_index) {
      logger.severe("Block index mismatch between features (" + features_index
          + ") and alignments (" + alignment_index + ").");
      throw new RuntimeException("Data block index mismatch.");
    }

    // Process source side.
    SourceValue sv = new SourceValue(Vocabulary.id(lhs_word), features_index);
    int[] source = new int[source_words.length];
    for (int i = 0; i < source_words.length; i++) {
      if (FormatUtils.isNonterminal(source_words[i]))
        source[i] = Vocabulary.id(FormatUtils.stripNt(source_words[i]));
      else
        source[i] = Vocabulary.id(source_words[i]);
    }
    source_trie.add(source, sv);

    // Process target side.
    TargetValue tv = new TargetValue(sv);
    int[] target = new int[target_words.length];
    for (int i = 0; i < target_words.length; i++) {
      if (FormatUtils.isNonterminal(target_words[i])) {
        target[target_words.length - (i + 1)] = -FormatUtils.getNonterminalIndex(target_words[i]);
      } else {
        target[target_words.length - (i + 1)] = Vocabulary.id(target_words[i]);
      }
    }
    target_trie.add(target, tv);
  }
  slices.add(flush(source_trie, target_trie, feature_buffer, alignment_buffer, num_slices));
}
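/*
 * A format sketch, not part of the original class: the rule lines binarize() consumes are
 * " ||| "-separated, which is what the "\\s\\|{3}\\s" split above matches; lines with fewer
 * than four fields are logged and skipped. The rule text here is purely illustrative.
 */
private void lineFormatSketch() {
  String line = "[X] ||| el gato ||| the cat ||| 0.5 0.25";
  String[] fields = line.split("\\s\\|{3}\\s");
  // fields[0] = "[X]" (lhs), fields[1] = "el gato" (source side),
  // fields[2] = "the cat" (target side), fields[3] = "0.5 0.25" (feature values).
  // Note that binarize() stores the target side reversed ("cat", "the"), since the
  // upward-pointing target trie shares common rule suffixes rather than prefixes.
  assert fields.length == 4;
}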