/**
 * First (exploration) pass over the grammar: adds all rule symbols and feature
 * names to the vocabulary and feeds every feature value to the type analyzer so
 * that encoders can be finalized later.
 */
private void explore(LineReader grammar) {
  int counter = 0;
  boolean first_line = true;

  while (grammar.hasNext()) {
    String line = grammar.next().trim();
    counter++;

    // Rule lines have the form: LHS ||| source ||| target ||| features
    String[] fields = line.split("\\s\\|{3}\\s");
    if (fields.length < 4) {
      logger.warning("Incomplete grammar line at line " + counter);
      continue;
    }
    String lhs = fields[0];
    String[] source = fields[1].split("\\s");
    String[] target = fields[2].split("\\s");
    String[] features = fields[3].split("\\s");

    Vocabulary.id(lhs);
    // Add symbols to vocabulary.
    for (String source_word : source) {
      if (FormatUtils.isNonterminal(source_word))
        Vocabulary.id(FormatUtils.stripNt(source_word));
      else
        Vocabulary.id(source_word);
    }
    for (String target_word : target) {
      if (FormatUtils.isNonterminal(target_word))
        Vocabulary.id(FormatUtils.stripNt(target_word));
      else
        Vocabulary.id(target_word);
    }

    // Test features for labeling.
    if (first_line && features.length != 0) {
      if (!features[0].contains("=")) {
        // We assume that if there is one unlabeled feature, the entire grammar is unlabeled.
        labeled = false;
      }
      this.types.setLabeled(labeled);
      first_line = false;
    }

    // Add feature names to vocabulary and pass the value through the
    // appropriate encoder.
    for (int f = 0; f < features.length; ++f) {
      if (labeled) {
        String[] fe = features[f].split("=");
        if (fe[0].equals("Alignment"))
          continue;
        types.observe(Vocabulary.id(fe[0]), Float.parseFloat(fe[1]));
      } else {
        types.observe(f, Float.parseFloat(features[f]));
      }
    }
  }
}
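/*
 * Illustrative example (made up, not taken from a real grammar): a labeled rule line such as
 *
 *   [X] ||| [X,1] maison ||| [X,1] house ||| p(e|f)=0.5 Lexical=0.2
 *
 * yields lhs = "[X]", source = {"[X,1]", "maison"}, target = {"[X,1]", "house"}, and two
 * labeled features whose names are added to the vocabulary and whose values are passed to
 * types.observe(). If the first feature on the first rule line contains no '=', the whole
 * grammar is treated as unlabeled and features are observed by position instead of by name.
 */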
/**
 * Executes the packing.
 *
 * @throws IOException
 */
public void pack() throws IOException {
  logger.info("Beginning exploration pass.");
  LineReader grammar_reader = null;
  LineReader alignment_reader = null;

  // Explore pass. Learn vocabulary and feature value histograms.
  logger.info("Exploring: " + grammar);
  grammar_reader = new LineReader(grammar);
  explore(grammar_reader);

  logger.info("Exploration pass complete. Freezing vocabulary and finalizing encoders.");
  if (dump != null) {
    PrintWriter dump_writer = new PrintWriter(dump);
    dump_writer.println(types.toString());
    dump_writer.close();
  }

  types.inferTypes(this.labeled);
  logger.info("Type inference complete.");

  logger.info("Finalizing encoding.");
  logger.info("Writing encoding.");
  types.write(output + File.separator + "encoding");

  logger.info("Freezing vocab.");
  Vocabulary.freeze();
  logger.info("Writing vocab.");
  Vocabulary.write(output + File.separator + "vocabulary");

  // Read previously written encoder configuration to match up to changed
  // vocabulary id's.
  logger.info("Reading encoding.");
  encoderConfig = new EncoderConfiguration();
  encoderConfig.load(output + File.separator + "encoding");

  logger.info("Beginning packing pass.");
  Queue<PackingFileTuple> slices = new PriorityQueue<PackingFileTuple>();
  // Actual binarization pass. Slice and pack source, target and data.
  grammar_reader = new LineReader(grammar);
  if (packAlignments)
    alignment_reader = new LineReader(alignments);
  binarize(grammar_reader, alignment_reader, slices);
  logger.info("Packing complete.");

  logger.info("Packed grammar in: " + output);
  logger.info("Done.");
}
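/*
 * Note: after pack() returns, the output directory holds at least the "encoding" and
 * "vocabulary" files written above. The packed slice files themselves are produced by
 * binarize(), which is handed the slices queue and is not shown in this excerpt.
 */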
public GrammarPacker(String grammar_filename, String config_filename, String output_filename,
    String alignments_filename, String featuredump_filename) throws IOException {
  this.labeled = true;
  this.grammar = grammar_filename;
  this.output = output_filename;
  this.dump = featuredump_filename;

  // TODO: Always open encoder config? This is debatable.
  this.types = new FeatureTypeAnalyzer(true);

  this.alignments = alignments_filename;
  packAlignments = (alignments != null);
  if (!packAlignments) {
    logger.info("No alignments file specified, skipping.");
  } else if (!new File(alignments_filename).exists()) {
    logger.severe("Alignments file does not exist: " + alignments);
    System.exit(0);
  }

  if (config_filename != null) {
    readConfig(config_filename);
    types.readConfig(config_filename);
  } else {
    logger.info("No config specified. Attempting auto-detection of feature types.");
  }

  File working_dir = new File(output);
  working_dir.mkdir();
  if (!working_dir.exists()) {
    logger.severe("Failed creating output directory.");
    System.exit(0);
  }
}
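/*
 * Minimal usage sketch (file names are placeholders, not from the source):
 *
 *   GrammarPacker packer = new GrammarPacker(
 *       "grammar.gz",     // grammar_filename
 *       null,             // config_filename: null triggers auto-detection of feature types
 *       "grammar.packed", // output_filename: created as a directory if it does not exist
 *       null,             // alignments_filename: null skips alignment packing
 *       null);            // featuredump_filename: null skips the feature histogram dump
 *   packer.pack();
 */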