コード例 #1
0
  /**
   * Processes every file named by the command line arguments.
   *
   * <p>Arguments from index {@code INITPARAMS} onward are taken as file names or file name
   * wildcard specifications. They are expanded with {@code
   * FileNameUtils.expandFileNameWildcards}, the number of resulting names is stored in {@code
   * docsToProcess} and reported on standard output, and each name is then handed to {@code
   * processOneFile} in order.
   *
   * @param args Command line arguments; entries before index {@code INITPARAMS} are not used
   *     here. When there are fewer than {@code INITPARAMS} entries, no file specifications are
   *     passed on for expansion.
   * @return The number of expanded file names, each of which was passed to {@code
   *     processOneFile}.
   */
  protected static int processFiles(String[] args) {
    //  Get file name/file wildcard specs.
    //  The count is clamped at zero so that too few arguments means
    //  "no files" rather than a NegativeArraySizeException.

    int specCount = Math.max(0, args.length - INITPARAMS);

    String[] wildCards = new String[specCount];

    //  Only copy when there is something to copy: with too few arguments
    //  INITPARAMS lies past the end of args and arraycopy would reject it.

    if (specCount > 0) {
      System.arraycopy(args, INITPARAMS, wildCards, 0, specCount);
    }
    //  Expand wildcards to list of
    //  file names.

    String[] fileNames = FileNameUtils.expandFileNameWildcards(wildCards);

    docsToProcess = fileNames.length;

    System.out.println(
        "There are "
            + Formatters.formatIntegerWithCommas(docsToProcess)
            + " documents to process.");
    //  Process each file.

    for (String fileName : fileNames) {
      processOneFile(fileName);
    }

    return fileNames.length;
  }
コード例 #2
0
  /**
   * Writes a copy of one XML file, filtered through a {@code StripAllWordElementsFilter} and with
   * its spacing normalized.
   *
   * <p>The output file is placed under {@code outputDirectoryName}, named after the input file
   * with its path removed and its extension replaced by {@code .xml}. The filtered copy is
   * written first, then read back, cleaned up as text, parsed, and written again to the same
   * output file.
   *
   * <p>Any exception raised along the way is caught here: its message is printed to standard
   * output, its stack trace is printed, and the method returns normally.
   *
   * @param xmlFileName Name of the input XML file.
   */
  protected static void processOneFile(String xmlFileName) {
    try {
      //  Announce the document, advancing the running document counter.

      String progress =
          "(" + ++currentDocNumber + "/" + docsToProcess + ") " + "processing " + xmlFileName;

      System.out.println(progress);

      //  Filter that strips <w> and <c> elements.

      XMLFilter wordStripper = new StripAllWordElementsFilter(XMLReaderFactory.createXMLReader());

      //  Derive the output path: drop the input's directory and extension,
      //  then place "<name>.xml" in the output directory.

      String baseName =
          FileNameUtils.changeFileExtension(FileNameUtils.stripPathName(xmlFileName), "");

      File outputFile = new File(outputDirectoryName, baseName + ".xml");

      String outputPath = outputFile.getAbsolutePath();

      //  The output directory must exist before anything is written.

      FileUtils.createPathForFile(outputPath);

      //  First pass: copy input to output through the stripping filter.

      new FilterAdornedFile(xmlFileName, outputPath, wordStripper);

      //  Second pass: read the copy back and tidy it as text. In order:
      //    - collapse each run of whitespace to a single space
      //    - remove the space before . ? ! , ; : )
      //    - remove the space after (
      //    - remove the space after a pilcrow
      //    - remove "__NS1:" and then any remaining "__NS1"

      String tidied =
          FileUtils.readTextFile(outputPath, "utf-8")
              .replaceAll("(\\s+)", " ")
              .replaceAll(" ([\\.?!,;:\\)])", "\u00241")
              .replaceAll("\\( ", "(")
              .replaceAll("\u00b6 ", "\u00b6")
              .replaceAll("__NS1:", "")
              .replaceAll("__NS1", "");

      //  Parse the tidied text and write it over the first-pass output.

      Document unadorned = new SAXBuilder().build(new StringReader(tidied));

      new AdornedXMLWriter(unadorned, outputPath);
    } catch (Exception failure) {
      System.out.println("   Error: " + failure.getMessage());

      failure.printStackTrace();
    }
  }