/**
 * Reads {@code ngtName} line by line and (re)initialises the fields {@code trans},
 * {@code ngtTm}, {@code ngtAl} and {@code prolog}.
 *
 * <p>Every line of the file is appended to {@code ngtAl}, followed by two {@code "###"}
 * sentinel entries so that each real line starts a complete three-line window. For each
 * window the three transcoded lines are concatenated into a key; keys that occur exactly
 * once are stored in {@code ngtTm}, mapped to the index of the window's first line.
 * {@code prolog} is reset to the opening of an XHTML document carrying OCR metadata.
 *
 * @param ngtName file whose lines are loaded
 * @throws Exception if the file cannot be opened or read, or if a called helper throws
 */
private void init(File ngtName) throws Exception {
    // Build everything that does not need the file first, so that a failure here
    // cannot leave an open reader behind.
    trans = new Transcoder(
            ClassLoader.getSystemResourceAsStream("eu/himeros/resources/transcoders/low2up.txt"));
    ngtTm = new TreeMap<>();
    ngtAl = new ArrayList<>(300000);

    prolog = new StringBuilder("<html xmlns=\"http://www.w3.org/1999/xhtml\">\n");
    prolog.append(
            "<head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />\n");
    prolog.append("<meta content=\"riguadon 0.3\" name=\"ocr-system\" />\n");
    // The hOCR metadata name is "ocr-number-of-pages"; the previous spelling was a typo.
    prolog.append("<meta name=\"ocr-number-of-pages\" content=\"???\" />\n");
    prolog.append("<meta name=\"ocr-langs\" content=\"grc lat\" />\n");
    prolog.append("<meta content=\"ocr_line ocr_page\" name=\"ocr-capabilities\" />\n");
    prolog.append("<link href=\"hocraggregate.css\" rel=\"stylesheet\" type=\"text/css\"/>\n");

    // Decode explicitly as UTF-8 instead of the platform default so the result does not
    // depend on the JVM's locale. NOTE(review): confirm the input files are UTF-8.
    br = new BufferedReader(new java.io.InputStreamReader(
            new java.io.FileInputStream(ngtName), java.nio.charset.StandardCharsets.UTF_8));
    try {
        String line;
        while ((line = br.readLine()) != null) {
            ngtAl.add(line);
        }
    } finally {
        // Always release the file handle, even when reading fails.
        br.close();
    }

    // Two sentinels give the last two real lines a full three-line window.
    ngtAl.add("###");
    ngtAl.add("###");

    // Keys already seen more than once. Without this, a key occurring an odd number of
    // times (3, 5, ...) would be removed and then wrongly re-inserted.
    java.util.Set<String> repeatedKeys = new java.util.HashSet<>();
    for (int i = 0; i < ngtAl.size() - 2; i++) {
        String it1 = trans.parse(ngtAl.get(i));
        String it2 = trans.parse(ngtAl.get(i + 1));
        String it3 = trans.parse(ngtAl.get(i + 2));
        String key = new StringBuilder(it1).append(it2).append(it3).toString();
        if (repeatedKeys.contains(key)) {
            continue;
        }
        if (ngtTm.containsKey(key)) {
            // Second occurrence: the key is not unique, drop it for good.
            ngtTm.remove(key);
            repeatedKeys.add(key);
        } else {
            ngtTm.put(key, i);
        }
    }
}
/** Matches each run of characters other than the letters Α-Ω and the ’ character. */
private static final java.util.regex.Pattern NON_WORD_CHARS =
        java.util.regex.Pattern.compile("[^Α-Ω’]+");

/**
 * Transcodes the text of {@code ocrWord}, strips every character outside Α-Ω and ’,
 * and appends the result to {@code ocrAl} unless nothing is left.
 *
 * @param ocrWord element whose text is read via {@code getText()}
 */
private void parseOcrWord(Element ocrWord) {
    // The pattern is compiled once instead of on every call. No trim() is needed:
    // whitespace is outside the allowed character set and is removed by the filter.
    String word = NON_WORD_CHARS.matcher(trans.parse(ocrWord.getText())).replaceAll("");
    if (word.isEmpty()) {
        return;
    }
    ocrAl.add(word);
}