예제 #1
0
  /**
   * Generate the concordance for the given file.
   *
   * <p>Opens the document named by {@code filename}, hands every page to {@code process_page},
   * prints the result via {@code print_concordance} and closes the document. If the document
   * cannot be opened, the error is reported on {@code System.err} and no pages are processed.
   *
   * <p>A {@code TETException} is reported on {@code System.err} (including the page number when it
   * was raised while a page was being processed) and then terminates the JVM with exit status 1.
   */
  private void execute() {
    TET tet = null;
    // 0 means "not currently inside the page loop"; the catch block uses it to pick the message.
    int pageno = 0;

    try {
      tet = new TET();
      tet.set_option(GLOBAL_OPTLIST);

      final int doc = tet.open_document(filename, DOC_OPTLIST);
      if (doc == -1) {
        System.err.println(
            "Error " + tet.get_errnum() + " in " + tet.get_apiname() + "(): " + tet.get_errmsg());
      } else {
        /*
         * Loop over pages in the document
         */
        final int n_pages = (int) tet.pcos_get_number(doc, "length:pages");
        for (pageno = 1; pageno <= n_pages; ++pageno) {
          process_page(tet, doc, pageno);
        }
        // The loop leaves pageno at n_pages + 1. Reset it so that an exception raised by the
        // document-level calls below is not attributed to a page that does not exist.
        pageno = 0;

        print_concordance(tet, doc);

        tet.close_document(doc);
      }
    } catch (TETException e) {
      if (pageno == 0) {
        System.err.println(
            "Error " + e.get_errnum() + " in " + e.get_apiname() + "(): " + e.get_errmsg() + "\n");
      } else {
        System.err.println(
            "Error "
                + e.get_errnum()
                + " in "
                + e.get_apiname()
                + "() on page "
                + pageno
                + ": "
                + e.get_errmsg()
                + "\n");
      }
      System.exit(1);
    } finally {
      // tet is still null when constructing it failed with something other than a TETException;
      // guarding here keeps a NullPointerException from masking that original failure.
      if (tet != null) {
        tet.delete();
      }
    }
  }
예제 #2
0
  /**
   * Handles a GET request by producing TETML for the document {@code infile} and writing a
   * progress/error report, wrapped in a minimal HTML page, to the response.
   *
   * <p>When {@code inmemory} is set, the TETML is retrieved as a byte array and parsed with a SAX
   * reader whose content handler is a {@code sax_handler}; afterwards the value of {@code
   * word_count} is reported. Otherwise the TETML is written to the file {@code infile + ".tetml"}.
   *
   * <p>TET errors and any other exception are written into the response body rather than
   * propagated; the HTML page is always closed and the TET object always deleted.
   *
   * @param request the incoming request (not inspected by this method)
   * @param response the response that receives the HTML report
   * @throws ServletException declared by the servlet contract
   * @throws IOException if the response writer cannot be obtained
   * @see HttpServlet#doGet(HttpServletRequest request, HttpServletResponse response)
   */
  protected void doGet(HttpServletRequest request, HttpServletResponse response)
      throws ServletException, IOException {
    response.setContentType("text/html; charset=UTF-8");
    PrintWriter writer = response.getWriter();

    writer.println("<html>");
    writer.println("<head>");
    // The title element must be terminated with </title>; an unclosed title swallows the body.
    writer.println("<title>TET J2EE TETML Servlet Example</title>");
    writer.println("</head>");
    writer.println("<body>");
    writer.println("<pre>");

    /*
     * For JRE 1.4 the property that selects the XML parser must be set
     * explicitly; later JREs seem to have a default set internally. It seems
     * to be the case that in 1.4 org.apache.crimson.parser.XMLReaderImpl is
     * always available.
     */
    String jre_version = System.getProperty("java.version");
    if (jre_version.startsWith("1.4")) {
      System.setProperty("org.xml.sax.driver", "org.apache.crimson.parser.XMLReaderImpl");
    }

    TET tet = null;
    try {
      tet = new TET();

      /* This is where input files live. Adjust as necessary. */
      ServletContext context = getServletContext();
      final String datapath = context.getRealPath("/WEB-INF/data");
      final String cmappath = context.getRealPath("/WEB-INF/resource/cmap");

      /* Global option list */
      final String globaloptlist = "searchpath={{" + datapath + "} {" + cmappath + "}}";

      tet.set_option(globaloptlist);

      final String tetmlname = infile + ".tetml";
      final String docoptlist =
          (inmemory ? "tetml={}" : "tetml={filename={" + tetmlname + "}}") + " " + basedocoptlist;

      if (inmemory) {
        writer.println("Processing TETML output for document \"" + infile + "\" in memory...");
      } else {
        writer.println(
            "Extracting TETML for document \"" + infile + "\" to file \"" + tetmlname + "\"...");
      }

      final int doc = tet.open_document(infile, docoptlist);
      if (doc == -1) {
        writer.println(
            "Error " + tet.get_errnum() + " in " + tet.get_apiname() + "(): " + tet.get_errmsg());
        // No explicit tet.delete() here: the finally block below deletes the object exactly once.
        return;
      }

      final int n_pages = (int) tet.pcos_get_number(doc, "length:pages");

      /*
       * Loop over pages in the document. Page numbers start at 1; page 0 is
       * used only for the trailer call below.
       */
      for (int pageno = 1; pageno <= n_pages; ++pageno) {
        tet.process_page(doc, pageno, pageoptlist);
      }

      /*
       * This could be combined with the last page-related call.
       */
      tet.process_page(doc, 0, "tetml={trailer}");

      if (inmemory) {
        /*
         * Get the XML document as a byte array.
         */
        final byte[] tetml = tet.get_xml_data(doc, "");

        if (tetml == null) {
          writer.println("tetml: couldn't retrieve XML data");
          return;
        }

        /*
         * Process the in-memory XML document to print out some
         * information that is extracted with the sax_handler class.
         */

        XMLReader reader = XMLReaderFactory.createXMLReader();
        reader.setContentHandler(new sax_handler(writer));
        reader.parse(new InputSource(new ByteArrayInputStream(tetml)));
        writer.println("Found " + word_count + " words in document");
      }

      tet.close_document(doc);
    } catch (TETException e) {
      writer.println(
          "Error " + e.get_errnum() + " in " + e.get_apiname() + "(): " + e.get_errmsg());
    } catch (Exception e) {
      e.printStackTrace(writer);
    } finally {
      // Always terminate the HTML page, even on the early-return and error paths.
      writer.println("</pre>");
      writer.println("</body>");
      writer.println("</html>");
      writer.close();
      // tet is null when the TET constructor itself failed.
      if (tet != null) {
        tet.delete();
      }
    }
  }
예제 #3
0
  /**
   * Process a single page of text.
   *
   * <p>Opens the page, fetches its text fragment by fragment until {@code get_text} returns
   * {@code null}, and increments the counter stored in {@code wordCounters} for every fragment
   * that starts with a letter. When {@code LOWERCASE_WORDS} is set, fragments are lower-cased
   * before counting. A page that cannot be opened, or an error number set after text retrieval,
   * is reported on {@code System.err} without throwing.
   *
   * @param tet TET object
   * @param doc TET document handle
   * @param pageno Page to process
   * @throws TETException An error occurred in the TET API
   */
  private void process_page(TET tet, final int doc, int pageno) throws TETException {
    final int page = tet.open_page(doc, pageno, PAGE_OPTLIST);

    if (page == -1) {
      System.err.println(
          "Error " + tet.get_errnum() + " in " + tet.get_apiname() + "(): " + tet.get_errmsg());
    } else {
      /*
       * Fetch the text word-wise.
       */
      for (String text = tet.get_text(page); text != null; text = tet.get_text(page)) {
        /*
         * Only include words that start with a letter. The emptiness check
         * avoids an index error on an empty fragment, and inspecting the
         * first code point (not the first char) also recognizes letters
         * outside the Basic Multilingual Plane.
         */
        if (!text.isEmpty() && Character.isLetter(text.codePointAt(0))) {
          final String word = LOWERCASE_WORDS ? text.toLowerCase() : text;

          // A missing entry means this is the first occurrence of the word.
          final Integer previous = (Integer) wordCounters.get(word);
          final int count = (previous == null) ? 1 : previous.intValue() + 1;
          wordCounters.put(word, Integer.valueOf(count));
        }
      }

      // get_text() returning null ends the loop; a non-zero error number is reported here.
      if (tet.get_errnum() != 0) {
        System.err.println(
            "Error " + tet.get_errnum() + " in " + tet.get_apiname() + "(): " + tet.get_errmsg());
      }

      tet.close_page(page);
    }
  }