static void processFile(String fname, PrintStream out) throws IOException { System.err.println("Processing: " + fname); FileInputStream fio = new FileInputStream(new File(fname)); InputStreamReader fread = new InputStreamReader(fio, JetTest.encoding); BufferedReader fp = new BufferedReader(fread); StringBuffer buf = new StringBuffer(); int docno = 0, allsents = 0, processedsents = 0; while (true) { String line = fp.readLine(); // EOF or an empty line: the end of a Document. if (line == null || line.equals("")) { if (0 < buf.length()) { SGMLProcessor.allTags = true; Document doc = SGMLProcessor.sgmlToDoc(buf.toString(), (String[]) null); doc.setSGMLwrapMargin(0); System.err.println( "Doc-" + docno + ": sents=" + allsents + ", processed=" + processedsents); processDoc1(doc, docno); writeDoc1(doc, out); out.flush(); buf = new StringBuffer(); docno++; allsents = 0; processedsents = 0; } if (line == null) { break; } else { continue; } } if (line.startsWith("#")) { // "#" indicates a comment line. buf.append(line + "\n"); } else { allsents++; if (processedsents < MaxProcessSentences) { buf.append("<sentence>"); String[] words = line.split(" "); for (int i = 0; i < words.length; i++) { if (0 != words[i].length()) { buf.append("<token>" + words[i] + " </token>"); } } buf.append("</sentence>\n"); processedsents++; } } } fp.close(); fread.close(); fio.close(); return; }
/**
 * Prints the SGML text of {@code doc} to {@code out} as a single line-terminated
 * record and flushes the stream.
 *
 * <p>The text is obtained from {@code doc.writeSGML(null)} and converted with
 * {@code toString()}.
 *
 * @param doc the document to serialize
 * @param out the destination stream; flushed but not closed
 * @throws IOException declared for callers; nothing in this body throws it directly
 */
static void writeDocRaw(Document doc, PrintStream out) throws IOException {
    String sgmlText = doc.writeSGML(null).toString();
    out.println(sgmlText);
    out.flush();
}