private void explore(LineReader grammar) {
  int counter = 0;
  boolean first_line = true;
  while (grammar.hasNext()) {
    String line = grammar.next().trim();
    counter++;
    String[] fields = line.split("\\s\\|{3}\\s");
    if (fields.length < 4) {
      logger.warning("Incomplete grammar line at line " + counter);
      continue;
    }
    String lhs = fields[0];
    String[] source = fields[1].split("\\s");
    String[] target = fields[2].split("\\s");
    String[] features = fields[3].split("\\s");

    Vocabulary.id(lhs);

    // Add symbols to vocabulary.
    for (String source_word : source) {
      if (FormatUtils.isNonterminal(source_word))
        Vocabulary.id(FormatUtils.stripNt(source_word));
      else
        Vocabulary.id(source_word);
    }
    for (String target_word : target) {
      if (FormatUtils.isNonterminal(target_word))
        Vocabulary.id(FormatUtils.stripNt(target_word));
      else
        Vocabulary.id(target_word);
    }

    // Test features for labeling.
    if (first_line && features.length != 0) {
      if (!features[0].contains("=")) {
        // We assume that if the first feature is unlabeled, the entire
        // grammar is unlabeled.
        labeled = false;
      }
      this.types.setLabeled(labeled);
      first_line = false;
    }

    // Add feature names to vocabulary and pass the values through the
    // appropriate encoder.
    for (int f = 0; f < features.length; ++f) {
      if (labeled) {
        String[] fe = features[f].split("=");
        if (fe[0].equals("Alignment"))
          continue;
        types.observe(Vocabulary.id(fe[0]), Float.parseFloat(fe[1]));
      } else {
        types.observe(f, Float.parseFloat(features[f]));
      }
    }
  }
}
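/*
 * Illustration (not from the source): explore() expects rules in the
 * |||-delimited grammar format, e.g. the hypothetical line
 *
 *   [X] ||| [X,1] maison ||| [X,1] house ||| p(e|f)=0.5 Lexical=1.2
 *
 * For this line, split("\\s\\|{3}\\s") yields the four fields
 * { "[X]", "[X,1] maison", "[X,1] house", "p(e|f)=0.5 Lexical=1.2" }.
 * Since the first feature contains '=', the grammar is treated as labeled,
 * and the encoders observe the values 0.5 and 1.2 under their feature ids.
 */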
private void readConfig(String config_filename) throws IOException {
  LineReader reader = new LineReader(config_filename);
  while (reader.hasNext()) {
    // Clean up the line: chop off comments and skip if the result is empty.
    String line = reader.next().trim();
    if (line.indexOf('#') != -1)
      line = line.substring(0, line.indexOf('#'));
    if (line.isEmpty())
      continue;
    String[] fields = line.split("[\\s]+");
    if (fields.length < 2) {
      logger.severe("Incomplete line in config.");
      // Exit with a non-zero status to signal the configuration error.
      System.exit(1);
    }
    if ("slice_size".equals(fields[0])) {
      // Number of records to concurrently load into memory for sorting.
      SLICE_SIZE = Integer.parseInt(fields[1]);
    }
  }
  reader.close();
}
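/*
 * A minimal config file this parser accepts (illustrative; slice_size is the
 * only key read here, and everything after '#' is stripped as a comment):
 *
 *   # number of records to load into memory per slice
 *   slice_size 1000000
 */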
private void binarize(LineReader grammar_reader, LineReader alignment_reader,
    Queue<PackingFileTuple> slices) throws IOException {
  int counter = 0;
  int slice_counter = 0;
  int num_slices = 0;

  boolean ready_to_flush = false;
  String first_source_word = null;

  PackingTrie<SourceValue> source_trie = new PackingTrie<SourceValue>();
  PackingTrie<TargetValue> target_trie = new PackingTrie<TargetValue>();
  FeatureBuffer feature_buffer = new FeatureBuffer();

  AlignmentBuffer alignment_buffer = null;
  if (packAlignments)
    alignment_buffer = new AlignmentBuffer();

  TreeMap<Integer, Float> features = new TreeMap<Integer, Float>();
  while (grammar_reader.hasNext()) {
    String grammar_line = grammar_reader.next().trim();
    counter++;
    slice_counter++;

    String[] fields = grammar_line.split("\\s\\|{3}\\s");
    if (fields.length < 4) {
      logger.warning("Incomplete grammar line at line " + counter);
      continue;
    }
    String lhs_word = fields[0];
    String[] source_words = fields[1].split("\\s");
    String[] target_words = fields[2].split("\\s");
    String[] feature_entries = fields[3].split("\\s");

    // Reached the slice size limit; indicate that we're closing up.
    if (!ready_to_flush
        && (slice_counter > SLICE_SIZE || feature_buffer.overflowing()
            || (packAlignments && alignment_buffer.overflowing()))) {
      ready_to_flush = true;
      first_source_word = source_words[0];
    }
    // Finished closing up.
    if (ready_to_flush && !first_source_word.equals(source_words[0])) {
      slices.add(flush(source_trie, target_trie, feature_buffer, alignment_buffer, num_slices));
      source_trie.clear();
      target_trie.clear();
      feature_buffer.clear();
      if (packAlignments)
        alignment_buffer.clear();

      num_slices++;
      slice_counter = 0;
      ready_to_flush = false;
    }

    int alignment_index = -1;
    // If present, process alignments.
    if (packAlignments) {
      if (!alignment_reader.hasNext()) {
        logger.severe("No more alignments starting in line " + counter);
        throw new RuntimeException("No more alignments starting in line " + counter);
      }
      String alignment_line = alignment_reader.next().trim();
      String[] alignment_entries = alignment_line.split("\\s");
      byte[] alignments = new byte[alignment_entries.length * 2];
      if (alignment_entries.length != 0) {
        for (int i = 0; i < alignment_entries.length; i++) {
          String[] parts = alignment_entries[i].split("-");
          alignments[2 * i] = Byte.parseByte(parts[0]);
          alignments[2 * i + 1] = Byte.parseByte(parts[1]);
        }
      }
      alignment_index = alignment_buffer.add(alignments);
    }

    // Process features.
    // Implicitly sort via TreeMap, write to the data buffer, and remember
    // the position to pass on to the source trie node.
    features.clear();
    for (int f = 0; f < feature_entries.length; ++f) {
      String feature_entry = feature_entries[f];
      if (this.labeled) {
        String[] parts = feature_entry.split("=");
        if (parts[0].equals("Alignment"))
          continue;
        int feature_id = Vocabulary.id(parts[0]);
        float feature_value = Float.parseFloat(parts[1]);
        if (feature_value != 0)
          features.put(encoderConfig.innerId(feature_id), feature_value);
      } else {
        float feature_value = Float.parseFloat(feature_entry);
        if (feature_value != 0)
          features.put(f, feature_value);
      }
    }
    int features_index = feature_buffer.add(features);

    // Sanity check on the data block index.
    if (packAlignments && features_index != alignment_index) {
      logger.severe("Block index mismatch between features (" + features_index
          + ") and alignments (" + alignment_index + ").");
      throw new RuntimeException("Data block index mismatch.");
    }

    // Process the source side.
    SourceValue sv = new SourceValue(Vocabulary.id(lhs_word), features_index);
    int[] source = new int[source_words.length];
    for (int i = 0; i < source_words.length; i++) {
      if (FormatUtils.isNonterminal(source_words[i]))
        source[i] = Vocabulary.id(FormatUtils.stripNt(source_words[i]));
      else
        source[i] = Vocabulary.id(source_words[i]);
    }
    source_trie.add(source, sv);

    // Process the target side: store it reversed, with nonterminals encoded
    // as negative indices.
    TargetValue tv = new TargetValue(sv);
    int[] target = new int[target_words.length];
    for (int i = 0; i < target_words.length; i++) {
      if (FormatUtils.isNonterminal(target_words[i])) {
        target[target_words.length - (i + 1)] = -FormatUtils.getNonterminalIndex(target_words[i]);
      } else {
        target[target_words.length - (i + 1)] = Vocabulary.id(target_words[i]);
      }
    }
    target_trie.add(target, tv);
  }
  // Flush the final (possibly partial) slice.
  slices.add(flush(source_trie, target_trie, feature_buffer, alignment_buffer, num_slices));
}
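/*
 * Sketch of the target-side encoding above (illustrative values, not from the
 * source): a target side "the [X,1] house" is stored in reverse order, with
 * each nonterminal replaced by the negated index of its source counterpart:
 *
 *   "the [X,1] house"  ->  { id("house"), -1, id("the") }
 *
 * Storing target strings reversed presumably lets the target trie share
 * common suffixes across rules. Note also that Byte.parseByte limits
 * alignment points to positions 0..127.
 */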