Example #1
0
  /**
   * Serializes the source, target and feature data structures into interlinked binary files. Target
   * is written first, into a skeletal (node don't carry any data) upward-pointing trie, updating
   * the linking source trie nodes with the position once it is known. Source and feature data are
   * written simultaneously. The source structure is written into a downward-pointing trie and
   * stores the rule's lhs as well as links to the target and feature stream. The feature stream is
   * prompted to write out a block
   *
   * @param source_trie
   * @param target_trie
   * @param feature_buffer
   * @param id
   * @throws IOException
   */
  private PackingFileTuple flush(
      PackingTrie<SourceValue> source_trie,
      PackingTrie<TargetValue> target_trie,
      FeatureBuffer feature_buffer,
      AlignmentBuffer alignment_buffer,
      int id)
      throws IOException {
    // Make a slice object for this piece of the grammar.
    PackingFileTuple slice = new PackingFileTuple("slice_" + String.format("%05d", id));
    // Pull out the streams for source, target and data output.
    DataOutputStream source_stream = slice.getSourceOutput();
    DataOutputStream target_stream = slice.getTargetOutput();
    DataOutputStream target_lookup_stream = slice.getTargetLookupOutput();
    DataOutputStream feature_stream = slice.getFeatureOutput();
    DataOutputStream alignment_stream = slice.getAlignmentOutput();

    Queue<PackingTrie<TargetValue>> target_queue;
    Queue<PackingTrie<SourceValue>> source_queue;

    // The number of bytes both written into the source stream and
    // buffered in the source queue.
    int source_position;
    // The number of bytes written into the target stream.
    int target_position;

    // Add trie root into queue, set target position to 0 and set cumulated
    // size to size of trie root.
    target_queue = new LinkedList<PackingTrie<TargetValue>>();
    target_queue.add(target_trie);
    target_position = 0;

    // Target lookup table for trie levels.
    int current_level_size = 1;
    int next_level_size = 0;
    ArrayList<Integer> target_lookup = new ArrayList<Integer>();

    // Packing loop for upwards-pointing target trie.
    while (!target_queue.isEmpty()) {
      // Pop top of queue.
      PackingTrie<TargetValue> node = target_queue.poll();
      // Register that this is where we're writing the node to.
      node.address = target_position;
      // Tell source nodes that we're writing to this position in the file.
      for (TargetValue tv : node.values) tv.parent.target = node.address;
      // Write link to parent.
      if (node.parent != null) target_stream.writeInt(node.parent.address);
      else target_stream.writeInt(-1);
      target_stream.writeInt(node.symbol);
      // Enqueue children.
      for (int k : node.children.descendingKeySet()) {
        PackingTrie<TargetValue> child = node.children.get(k);
        target_queue.add(child);
      }
      target_position += node.size(false, true);
      next_level_size += node.children.descendingKeySet().size();

      current_level_size--;
      if (current_level_size == 0) {
        target_lookup.add(target_position);
        current_level_size = next_level_size;
        next_level_size = 0;
      }
    }
    target_lookup_stream.writeInt(target_lookup.size());
    for (int i : target_lookup) target_lookup_stream.writeInt(i);
    target_lookup_stream.close();

    // Setting up for source and data writing.
    source_queue = new LinkedList<PackingTrie<SourceValue>>();
    source_queue.add(source_trie);
    source_position = source_trie.size(true, false);
    source_trie.address = target_position;

    // Ready data buffers for writing.
    feature_buffer.initialize();
    if (packAlignments) alignment_buffer.initialize();

    // Packing loop for downwards-pointing source trie.
    while (!source_queue.isEmpty()) {
      // Pop top of queue.
      PackingTrie<SourceValue> node = source_queue.poll();
      // Write number of children.
      source_stream.writeInt(node.children.size());
      // Write links to children.
      for (int k : node.children.descendingKeySet()) {
        PackingTrie<SourceValue> child = node.children.get(k);
        // Enqueue child.
        source_queue.add(child);
        // Child's address will be at the current end of the queue.
        child.address = source_position;
        // Advance cumulated size by child's size.
        source_position += child.size(true, false);
        // Write the link.
        source_stream.writeInt(k);
        source_stream.writeInt(child.address);
      }
      // Write number of data items.
      source_stream.writeInt(node.values.size());
      // Write lhs and links to target and data.
      for (SourceValue sv : node.values) {
        int feature_block_index = feature_buffer.write(sv.data);
        if (packAlignments) {
          int alignment_block_index = alignment_buffer.write(sv.data);
          if (alignment_block_index != feature_block_index) {
            logger.severe("Block index mismatch.");
            throw new RuntimeException(
                "Block index mismatch: alignment ("
                    + alignment_block_index
                    + ") and features ("
                    + feature_block_index
                    + ") don't match.");
          }
        }
        source_stream.writeInt(sv.lhs);
        source_stream.writeInt(sv.target);
        source_stream.writeInt(feature_block_index);
      }
    }
    // Flush the data stream.
    feature_buffer.flush(feature_stream);
    if (packAlignments) alignment_buffer.flush(alignment_stream);

    target_stream.close();
    source_stream.close();
    feature_stream.close();
    if (packAlignments) alignment_stream.close();

    return slice;
  }
Example #2
0
  private void binarize(
      LineReader grammar_reader, LineReader alignment_reader, Queue<PackingFileTuple> slices)
      throws IOException {
    int counter = 0;
    int slice_counter = 0;
    int num_slices = 0;

    boolean ready_to_flush = false;
    String first_source_word = null;

    PackingTrie<SourceValue> source_trie = new PackingTrie<SourceValue>();
    PackingTrie<TargetValue> target_trie = new PackingTrie<TargetValue>();
    FeatureBuffer feature_buffer = new FeatureBuffer();

    AlignmentBuffer alignment_buffer = null;
    if (packAlignments) alignment_buffer = new AlignmentBuffer();

    TreeMap<Integer, Float> features = new TreeMap<Integer, Float>();
    while (grammar_reader.hasNext()) {
      String grammar_line = grammar_reader.next().trim();
      counter++;
      slice_counter++;

      String[] fields = grammar_line.split("\\s\\|{3}\\s");
      if (fields.length < 4) {
        logger.warning("Incomplete grammar line at line " + counter);
        continue;
      }
      String lhs_word = fields[0];
      String[] source_words = fields[1].split("\\s");
      String[] target_words = fields[2].split("\\s");
      String[] feature_entries = fields[3].split("\\s");

      // Reached slice limit size, indicate that we're closing up.
      if (!ready_to_flush
          && (slice_counter > SLICE_SIZE
              || feature_buffer.overflowing()
              || (packAlignments && alignment_buffer.overflowing()))) {
        ready_to_flush = true;
        first_source_word = source_words[0];
      }
      // Finished closing up.
      if (ready_to_flush && !first_source_word.equals(source_words[0])) {
        slices.add(flush(source_trie, target_trie, feature_buffer, alignment_buffer, num_slices));
        source_trie.clear();
        target_trie.clear();
        feature_buffer.clear();
        if (packAlignments) alignment_buffer.clear();

        num_slices++;
        slice_counter = 0;
        ready_to_flush = false;
      }

      int alignment_index = -1;
      // If present, process alignments.
      if (packAlignments) {
        if (!alignment_reader.hasNext()) {
          logger.severe("No more alignments starting in line " + counter);
          throw new RuntimeException("No more alignments starting in line " + counter);
        } else {
          String alignment_line = alignment_reader.next().trim();
          String[] alignment_entries = alignment_line.split("\\s");
          byte[] alignments = new byte[alignment_entries.length * 2];
          if (alignment_entries.length != 0) {
            for (int i = 0; i < alignment_entries.length; i++) {
              String[] parts = alignment_entries[i].split("-");
              alignments[2 * i] = Byte.parseByte(parts[0]);
              alignments[2 * i + 1] = Byte.parseByte(parts[1]);
            }
          }
          alignment_index = alignment_buffer.add(alignments);
        }
      }

      // Process features.
      // Implicitly sort via TreeMap, write to data buffer, remember position
      // to pass on to the source trie node.
      features.clear();
      for (int f = 0; f < feature_entries.length; ++f) {
        String feature_entry = feature_entries[f];
        if (this.labeled) {
          String[] parts = feature_entry.split("=");
          if (parts[0].equals("Alignment")) continue;
          int feature_id = Vocabulary.id(parts[0]);
          float feature_value = Float.parseFloat(parts[1]);
          if (feature_value != 0) features.put(encoderConfig.innerId(feature_id), feature_value);
        } else {
          float feature_value = Float.parseFloat(feature_entry);
          if (feature_value != 0) features.put(f, feature_value);
        }
      }
      int features_index = feature_buffer.add(features);

      // Sanity check on the data block index.
      if (packAlignments && features_index != alignment_index) {
        logger.severe(
            "Block index mismatch between features ("
                + features_index
                + ") and alignments ("
                + alignment_index
                + ").");
        throw new RuntimeException("Data block index mismatch.");
      }

      // Process source side.
      SourceValue sv = new SourceValue(Vocabulary.id(lhs_word), features_index);
      int[] source = new int[source_words.length];
      for (int i = 0; i < source_words.length; i++) {
        if (FormatUtils.isNonterminal(source_words[i]))
          source[i] = Vocabulary.id(FormatUtils.stripNt(source_words[i]));
        else source[i] = Vocabulary.id(source_words[i]);
      }
      source_trie.add(source, sv);

      // Process target side.
      TargetValue tv = new TargetValue(sv);
      int[] target = new int[target_words.length];
      for (int i = 0; i < target_words.length; i++) {
        if (FormatUtils.isNonterminal(target_words[i])) {
          target[target_words.length - (i + 1)] = -FormatUtils.getNonterminalIndex(target_words[i]);
        } else {
          target[target_words.length - (i + 1)] = Vocabulary.id(target_words[i]);
        }
      }
      target_trie.add(target, tv);
    }
    slices.add(flush(source_trie, target_trie, feature_buffer, alignment_buffer, num_slices));
  }