/** Process files. */ protected static int processFiles(String[] args) { int result = 0; // Get file name/file wildcard specs. String[] wildCards = new String[args.length - INITPARAMS]; for (int i = INITPARAMS; i < args.length; i++) { wildCards[i - INITPARAMS] = args[i]; } // Expand wildcards to list of // file names. String[] fileNames = FileNameUtils.expandFileNameWildcards(wildCards); docsToProcess = fileNames.length; System.out.println( "There are " + Formatters.formatIntegerWithCommas(docsToProcess) + " documents to process."); // Process each file. for (int i = 0; i < fileNames.length; i++) { processOneFile(fileNames[i]); } return fileNames.length; }
/** * Process one file. * * <p>\u0040param xmlFileName Input file name to check for part of speech/lemma mismatches.. */ protected static void processOneFile(String xmlFileName) { try { // Report document being processed. System.out.println( "(" + ++currentDocNumber + "/" + docsToProcess + ") " + "processing " + xmlFileName); // Create filter to strip <w> and <c> // elements. XMLFilter filter = new StripAllWordElementsFilter(XMLReaderFactory.createXMLReader()); // Strip path from input file name. String strippedFileName = FileNameUtils.stripPathName(xmlFileName); strippedFileName = FileNameUtils.changeFileExtension(strippedFileName, ""); // Generate output file name. String xmlOutputFileName = new File(outputDirectoryName, strippedFileName + ".xml").getAbsolutePath(); // Make sure output directory exists. FileUtils.createPathForFile(xmlOutputFileName); // Copy input xml to output xml, // stripping <w> and <c> elements. new FilterAdornedFile(xmlFileName, xmlOutputFileName, filter); // Read it back and fix spacing. String fixedXML = FileUtils.readTextFile(xmlOutputFileName, "utf-8"); fixedXML = fixedXML.replaceAll("(\\s+)", " "); fixedXML = fixedXML.replaceAll(" ([\\.?!,;:\\)])", "\u00241"); fixedXML = fixedXML.replaceAll("\\( ", "("); fixedXML = fixedXML.replaceAll("\u00b6 ", "\u00b6"); fixedXML = fixedXML.replaceAll("__NS1:", ""); fixedXML = fixedXML.replaceAll("__NS1", ""); /* fixedXML = fixedXML.replaceAll ( "</__NS1:" , "" ); */ // Emit unadorned XML. SAXBuilder builder = new SAXBuilder(); Document document = builder.build(new StringReader(fixedXML)); new AdornedXMLWriter(document, xmlOutputFileName); } catch (Exception e) { System.out.println(" Error: " + e.getMessage()); e.printStackTrace(); } }