/** * Main program. * * <p>\u0040param args Program parameters. */ public static void main(String[] args) { // Initialize. try { if (!initialize(args)) { System.exit(1); } // Process all files. long startTime = System.currentTimeMillis(); int filesProcessed = processFiles(args); long processingTime = (System.currentTimeMillis() - startTime + 999) / 1000; // Terminate. terminate(filesProcessed, processingTime); } catch (Exception e) { System.out.println(e.getMessage()); } }
/** * Process one file. * * <p>\u0040param xmlFileName Input file name to check for part of speech/lemma mismatches.. */ protected static void processOneFile(String xmlFileName) { try { // Report document being processed. System.out.println( "(" + ++currentDocNumber + "/" + docsToProcess + ") " + "processing " + xmlFileName); // Create filter to strip <w> and <c> // elements. XMLFilter filter = new StripAllWordElementsFilter(XMLReaderFactory.createXMLReader()); // Strip path from input file name. String strippedFileName = FileNameUtils.stripPathName(xmlFileName); strippedFileName = FileNameUtils.changeFileExtension(strippedFileName, ""); // Generate output file name. String xmlOutputFileName = new File(outputDirectoryName, strippedFileName + ".xml").getAbsolutePath(); // Make sure output directory exists. FileUtils.createPathForFile(xmlOutputFileName); // Copy input xml to output xml, // stripping <w> and <c> elements. new FilterAdornedFile(xmlFileName, xmlOutputFileName, filter); // Read it back and fix spacing. String fixedXML = FileUtils.readTextFile(xmlOutputFileName, "utf-8"); fixedXML = fixedXML.replaceAll("(\\s+)", " "); fixedXML = fixedXML.replaceAll(" ([\\.?!,;:\\)])", "\u00241"); fixedXML = fixedXML.replaceAll("\\( ", "("); fixedXML = fixedXML.replaceAll("\u00b6 ", "\u00b6"); fixedXML = fixedXML.replaceAll("__NS1:", ""); fixedXML = fixedXML.replaceAll("__NS1", ""); /* fixedXML = fixedXML.replaceAll ( "</__NS1:" , "" ); */ // Emit unadorned XML. SAXBuilder builder = new SAXBuilder(); Document document = builder.build(new StringReader(fixedXML)); new AdornedXMLWriter(document, xmlOutputFileName); } catch (Exception e) { System.out.println(" Error: " + e.getMessage()); e.printStackTrace(); } }