/** * Creates a SAX XMLReader. * * @return <code>XMLReader</code> a SAX2 parser. * @throws Exception if no parser can be created. */ protected XMLReader createParser() throws Exception { XMLReader parser = null; // Try using JAXP... // Note we need JAXP 1.1, and if JAXP 1.0 is all that's // available then the getXMLReader call fails and we skip // to the hard coded default parser try { Class factoryClass = Class.forName("javax.xml.parsers.SAXParserFactory"); // MODIFIED: Added (Class[]) and (Object[]) cast sentences in "null" parameters // to avoid warnings during compilation // factory = SAXParserFactory.newInstance(); Method newParserInstance = factoryClass.getMethod("newInstance", (Class[]) null); Object factory = newParserInstance.invoke(null, (Object[]) null); // jaxpParser = factory.newSAXParser(); Method newSAXParser = factoryClass.getMethod("newSAXParser", (Class[]) null); Object jaxpParser = newSAXParser.invoke(factory, (Object[]) null); // parser = jaxpParser.getXMLReader(); Class parserClass = jaxpParser.getClass(); Method getXMLReader = parserClass.getMethod("getXMLReader", (Class[]) null); parser = (XMLReader) getXMLReader.invoke(jaxpParser, (Object[]) null); } catch (ClassNotFoundException e) { // e.printStackTrace(); } catch (InvocationTargetException e) { // e.printStackTrace(); } catch (NoSuchMethodException e) { // e.printStackTrace(); } catch (IllegalAccessException e) { // e.printStackTrace(); } // Check to see if we got a parser yet, if not, try to use a // hard coded default if (parser == null) { parser = XMLReaderFactory.createXMLReader("org.apache.xerces.parsers.SAXParser"); } return parser; }
/** * Process one file. * * <p>\u0040param xmlFileName Input file name to check for part of speech/lemma mismatches.. */ protected static void processOneFile(String xmlFileName) { try { // Report document being processed. System.out.println( "(" + ++currentDocNumber + "/" + docsToProcess + ") " + "processing " + xmlFileName); // Create filter to strip <w> and <c> // elements. XMLFilter filter = new StripAllWordElementsFilter(XMLReaderFactory.createXMLReader()); // Strip path from input file name. String strippedFileName = FileNameUtils.stripPathName(xmlFileName); strippedFileName = FileNameUtils.changeFileExtension(strippedFileName, ""); // Generate output file name. String xmlOutputFileName = new File(outputDirectoryName, strippedFileName + ".xml").getAbsolutePath(); // Make sure output directory exists. FileUtils.createPathForFile(xmlOutputFileName); // Copy input xml to output xml, // stripping <w> and <c> elements. new FilterAdornedFile(xmlFileName, xmlOutputFileName, filter); // Read it back and fix spacing. String fixedXML = FileUtils.readTextFile(xmlOutputFileName, "utf-8"); fixedXML = fixedXML.replaceAll("(\\s+)", " "); fixedXML = fixedXML.replaceAll(" ([\\.?!,;:\\)])", "\u00241"); fixedXML = fixedXML.replaceAll("\\( ", "("); fixedXML = fixedXML.replaceAll("\u00b6 ", "\u00b6"); fixedXML = fixedXML.replaceAll("__NS1:", ""); fixedXML = fixedXML.replaceAll("__NS1", ""); /* fixedXML = fixedXML.replaceAll ( "</__NS1:" , "" ); */ // Emit unadorned XML. SAXBuilder builder = new SAXBuilder(); Document document = builder.build(new StringReader(fixedXML)); new AdornedXMLWriter(document, xmlOutputFileName); } catch (Exception e) { System.out.println(" Error: " + e.getMessage()); e.printStackTrace(); } }