/**
 * Generate the concordance for the given file.
 *
 * <p>Opens the document named by {@code filename}, calls {@code process_page}
 * for every page from 1 up to the page count reported by the document, then
 * prints the concordance and closes the document. A failure to open the
 * document is reported on {@code System.err}. A {@code TETException} is
 * reported on {@code System.err} as well and then terminates the JVM with
 * exit status 1.
 */
private void execute() {
    TET tet = null;
    int pageno = 0;

    try {
        tet = new TET();
        tet.set_option(GLOBAL_OPTLIST);

        final int doc = tet.open_document(filename, DOC_OPTLIST);
        if (doc == -1) {
            System.err.println(
                    "Error " + tet.get_errnum() + " in " + tet.get_apiname()
                            + "(): " + tet.get_errmsg());
        } else {
            /*
             * Loop over pages in the document
             */
            final int n_pages = (int) tet.pcos_get_number(doc, "length:pages");
            for (pageno = 1; pageno <= n_pages; ++pageno) {
                process_page(tet, doc, pageno);
            }

            print_concordance(tet, doc);
            tet.close_document(doc);
        }
    } catch (TETException e) {
        // pageno is still 0 when the exception was raised before the page
        // loop started; in that case no page number is included.
        if (pageno == 0) {
            System.err.println(
                    "Error " + e.get_errnum() + " in " + e.get_apiname()
                            + "(): " + e.get_errmsg() + "\n");
        } else {
            System.err.println(
                    "Error " + e.get_errnum() + " in " + e.get_apiname()
                            + "() on page " + pageno + ": " + e.get_errmsg() + "\n");
        }
        System.exit(1);
    } finally {
        // tet is still null if the constructor itself failed with something
        // other than a TETException; guard so that the original failure is
        // not replaced by a NullPointerException.
        if (tet != null) {
            tet.delete();
        }
    }
}
/**
 * Handles a GET request by producing TETML for the document named by
 * {@code infile} and writing a plain-text progress report, wrapped in a
 * minimal HTML page, to the response.
 *
 * <p>Depending on {@code inmemory}, the TETML is either kept in memory and
 * parsed with a SAX reader using {@code sax_handler}, or written to the file
 * {@code infile + ".tetml"}. Errors are written into the response body rather
 * than propagated.
 *
 * @param request  the servlet request (not read by this method)
 * @param response the servlet response that receives the HTML report
 * @throws ServletException declared by the servlet API
 * @throws IOException if the response writer cannot be obtained
 * @see HttpServlet#doGet(HttpServletRequest request, HttpServletResponse response)
 */
protected void doGet(HttpServletRequest request, HttpServletResponse response)
        throws ServletException, IOException {
    response.setContentType("text/html; charset=UTF-8");
    PrintWriter writer = response.getWriter();

    writer.println("<html>");
    writer.println("<head>");
    writer.println("<title>TET J2EE TETML Servlet Example</title>");
    writer.println("</head>");
    writer.println("<body>");
    writer.println("<pre>");

    /*
     * For JRE 1.4 the property must be set what XML parser to use, later
     * JREs seem to have a default set internally. It seems to be the case
     * that in 1.4 org.apache.crimson.parser.XMLReaderImpl is always
     * available.
     */
    String jre_version = System.getProperty("java.version");
    if (jre_version.startsWith("1.4")) {
        System.setProperty("org.xml.sax.driver",
                "org.apache.crimson.parser.XMLReaderImpl");
    }

    TET tet = null;

    try {
        tet = new TET();

        /* This is where input files live. Adjust as necessary. */
        ServletContext context = getServletContext();
        final String datapath = context.getRealPath("/WEB-INF/data");
        final String cmappath = context.getRealPath("/WEB-INF/resource/cmap");

        /* Global option list */
        final String globaloptlist =
                "searchpath={{" + datapath + "} {" + cmappath + "}}";
        tet.set_option(globaloptlist);

        final String tetmlname = infile + ".tetml";

        // An empty tetml option keeps the output in memory; otherwise the
        // output file name is passed in the option list.
        final String docoptlist =
                (inmemory ? "tetml={}" : "tetml={filename={" + tetmlname + "}}")
                        + " " + basedocoptlist;

        if (inmemory) {
            writer.println("Processing TETML output for document \""
                    + infile + "\" in memory...");
        } else {
            writer.println("Extracting TETML for document \""
                    + infile + "\" to file \"" + tetmlname + "\"...");
        }

        final int doc = tet.open_document(infile, docoptlist);
        if (doc == -1) {
            writer.println(
                    "Error " + tet.get_errnum() + " in " + tet.get_apiname()
                            + "(): " + tet.get_errmsg());
            // The finally block below releases tet; no explicit delete()
            // here, so the object is released exactly once.
            return;
        }

        final int n_pages = (int) tet.pcos_get_number(doc, "length:pages");

        /*
         * Loop over pages in the document. Page numbers start at 1; page 0
         * is used only for the trailer call that follows the loop.
         */
        for (int pageno = 1; pageno <= n_pages; ++pageno) {
            tet.process_page(doc, pageno, pageoptlist);
        }

        /*
         * This could be combined with the last page-related call.
         */
        tet.process_page(doc, 0, "tetml={trailer}");

        if (inmemory) {
            /*
             * Get the XML document as a byte array.
             */
            final byte[] tetml = tet.get_xml_data(doc, "");
            if (tetml == null) {
                writer.println("tetml: couldn't retrieve XML data");
                return;
            }

            /*
             * Process the in-memory XML document to print out some
             * information that is extracted with the sax_handler class.
             */
            XMLReader reader = XMLReaderFactory.createXMLReader();
            reader.setContentHandler(new sax_handler(writer));
            reader.parse(new InputSource(new ByteArrayInputStream(tetml)));

            // NOTE(review): word_count is declared outside this method and is
            // presumably updated by sax_handler during parsing - confirm.
            writer.println("Found " + word_count + " words in document");
        }

        tet.close_document(doc);
    } catch (TETException e) {
        writer.println(
                "Error " + e.get_errnum() + " in " + e.get_apiname()
                        + "(): " + e.get_errmsg());
    } catch (Exception e) {
        // Any other failure is reported to the client as a stack trace
        // inside the <pre> block.
        e.printStackTrace(writer);
    } finally {
        // Always terminate the HTML page and release the TET object, also on
        // the early-return paths above.
        writer.println("</pre>");
        writer.println("</body>");
        writer.println("</html>");
        writer.close();
        if (tet != null) {
            tet.delete();
        }
    }
}