Ejemplo n.º 1
0
  /**
   * Creates a SAX XMLReader.
   *
   * <p>The parser is first requested through JAXP. Every JAXP call is made reflectively so that
   * this class has no compile-time dependency on JAXP 1.1: if only JAXP 1.0 is available the
   * <code>getXMLReader</code> lookup fails and the hard coded default parser is used instead.
   *
   * <p>All reflective lookups are performed on the public JAXP API classes
   * (<code>javax.xml.parsers.SAXParserFactory</code> and <code>javax.xml.parsers.SAXParser</code>)
   * rather than on the runtime implementation classes, because an implementation class may not
   * be accessible to the caller even though the API method it inherits is.
   *
   * @return <code>XMLReader</code> a SAX2 parser.
   * @throws Exception if no parser can be created.
   */
  protected XMLReader createParser() throws Exception {
    XMLReader parser = null;

    // Try using JAXP...
    // Note we need JAXP 1.1, and if JAXP 1.0 is all that's
    // available then the getXMLReader call fails and we skip
    // to the hard coded default parser
    try {
      Class<?> factoryClass = Class.forName("javax.xml.parsers.SAXParserFactory");

      // The (Class<?>[]) and (Object[]) casts on the null arguments avoid
      // varargs ambiguity warnings during compilation.

      // factory = SAXParserFactory.newInstance();
      Method newParserInstance = factoryClass.getMethod("newInstance", (Class<?>[]) null);
      Object factory = newParserInstance.invoke(null, (Object[]) null);

      // jaxpParser = factory.newSAXParser();
      Method newSAXParser = factoryClass.getMethod("newSAXParser", (Class<?>[]) null);
      Object jaxpParser = newSAXParser.invoke(factory, (Object[]) null);

      // parser = jaxpParser.getXMLReader();
      // Resolve the method on the public API class, not on jaxpParser.getClass():
      // invoking a Method object obtained from an inaccessible implementation
      // class raises IllegalAccessException, which would silently disable JAXP.
      Class<?> parserClass = Class.forName("javax.xml.parsers.SAXParser");
      Method getXMLReader = parserClass.getMethod("getXMLReader", (Class<?>[]) null);
      parser = (XMLReader) getXMLReader.invoke(jaxpParser, (Object[]) null);
    } catch (ClassNotFoundException ignored) {
      // JAXP is not on the classpath; fall through to the default parser.
    } catch (InvocationTargetException ignored) {
      // JAXP could not build a factory/parser; fall through to the default parser.
    } catch (NoSuchMethodException ignored) {
      // JAXP 1.0 only (no getXMLReader); fall through to the default parser.
    } catch (IllegalAccessException ignored) {
      // JAXP methods are not callable from here; fall through to the default parser.
    }

    // Check to see if we got a parser yet, if not, try to use a
    // hard coded default
    if (parser == null) {
      parser = XMLReaderFactory.createXMLReader("org.apache.xerces.parsers.SAXParser");
    }
    return parser;
  }
Ejemplo n.º 2
0
  /**
   * Processes a single XML file.
   *
   * <p>The input is copied through a {@code StripAllWordElementsFilter} to a file in the output
   * directory whose name is derived from the input file name with an <code>.xml</code> suffix.
   * That copy is then read back as text, its spacing is normalized, leftover {@code __NS1}
   * markers are deleted, and the resulting text is re-parsed and written to the same output file.
   *
   * <p>No exception escapes this method: any failure is reported as an <code>Error:</code> line
   * on standard output followed by a printed stack trace.
   *
   * @param xmlFileName Input file name to process.
   */
  protected static void processOneFile(String xmlFileName) {
    try {
      // Progress line; note that building it also advances the document counter.
      String progress =
          "(" + ++currentDocNumber + "/" + docsToProcess + ") " + "processing " + xmlFileName;
      System.out.println(progress);

      // Filter used to strip <w> and <c> elements while copying.
      XMLFilter wordStripper = new StripAllWordElementsFilter(XMLReaderFactory.createXMLReader());

      // Output path: input name without its directory, extension swapped for
      // "" and then ".xml" appended, placed in the output directory.
      String baseName =
          FileNameUtils.changeFileExtension(FileNameUtils.stripPathName(xmlFileName), "");
      File outputFile = new File(outputDirectoryName, baseName + ".xml");
      String outputPath = outputFile.getAbsolutePath();

      // The output directory must exist before anything is written into it.
      FileUtils.createPathForFile(outputPath);

      // Filtered copy of the input; the constructor call itself does the work.
      new FilterAdornedFile(xmlFileName, outputPath, wordStripper);

      // Read the copy back and repair spacing. The order of the steps matters:
      // whitespace is collapsed first so the later patterns only ever have to
      // match a single space.
      String cleanedText =
          FileUtils.readTextFile(outputPath, "utf-8")
              // Collapse every whitespace run into one space.
              .replaceAll("(\\s+)", " ")
              // Drop the space in front of closing punctuation.
              .replaceAll(" ([\\.?!,;:\\)])", "\u00241")
              // Drop the space after an opening parenthesis.
              .replaceAll("\\( ", "(")
              // Drop the space after a pilcrow sign.
              .replaceAll("\u00b6 ", "\u00b6")
              // Remove namespace placeholder markers, prefixed form first.
              .replaceAll("__NS1:", "")
              .replaceAll("__NS1", "");

      // Re-parse the cleaned text and emit the unadorned XML over the copy.
      Document cleanedDocument = new SAXBuilder().build(new StringReader(cleanedText));
      new AdornedXMLWriter(cleanedDocument, outputPath);
    } catch (Exception e) {
      System.out.println("   Error: " + e.getMessage());
      e.printStackTrace();
    }
  }