public boolean dehyphen() { if (listener != null) listener.progress(1); vd.showPerlMessage("Checking files...\n"); if (hasProblems()) { vd.showPerlMessage(""); vd.showPerlMessage("Files with problems are listed above. \n"); vd.showPerlMessage( "Run this step again after the above identified problems are corrected.\n"); listener.progress(0); return false; } else { vd.showPerlMessage("File checking completed. \n"); listener.progress(5); vd.showPerlMessage("Pre-processing files... \n"); fillInWords(); if (listener != null) listener.progress(50); DeHyphenizer dh = new DeHyphenizerCorrected( this.database, this.tablename, "word", "count", "-", this.glossarytable, glossary); try { Statement stmt = conn.createStatement(); ResultSet rs = stmt.executeQuery("select word from " + tablename + " where word like '%-%'"); while (rs.next()) { String word = rs.getString("word"); String dhword = dh.normalFormat(word) .replaceAll( "-", "_"); // so dhwords in _allwords table are comparable to words in _wordpos and // other tables. Statement stmt1 = conn.createStatement(); stmt1.execute( "update " + tablename + " set dhword ='" + dhword + "' where word='" + word + "'"); mapping.put(word, dhword); } stmt.execute("update " + tablename + " set dhword=word where dhword is null"); } catch (Exception e) { LOGGER.error("Problem in VolumeDehyphenizer:dehyphen", e); e.printStackTrace(); } normalizeDocument(); if (listener != null) listener.progress(100); return true; } }
/** check for unmatched brackets too. */ private void fillInWords() { try { Statement stmt = conn.createStatement(); ResultSet rs = null; File[] flist = folder.listFiles(); int total = flist.length; for (int i = 0; i < flist.length; i++) { BufferedReader reader = new BufferedReader(new FileReader(flist[i])); String line = null; StringBuffer sb = new StringBuffer(); while ((line = reader.readLine()) != null) { line = line.replaceAll(System.getProperty("line.separator"), " "); sb.append(line); } reader.close(); String text = sb.toString(); text = text.toLowerCase(); text = text.replaceAll("<[^<]+?>", " "); text = text.replaceAll("\\d", " "); text = text.replaceAll("\\(", " ( "); text = text.replaceAll("\\)", " ) "); text = text.replaceAll("\\[", " [ "); text = text.replaceAll("\\]", " ] "); text = text.replaceAll("\\{", " { "); text = text.replaceAll("\\}", " } "); text = text.replaceAll("\\s+", " ").trim(); String[] words = text.split("\\s+"); int lround = 0; int lsquare = 0; int lcurly = 0; int inbracket = 0; for (int j = 0; j < words.length; j++) { String w = words[j].trim(); if (w.compareTo("(") == 0) lround++; else if (w.compareTo(")") == 0) lround--; else if (w.compareTo("[") == 0) lsquare++; else if (w.compareTo("]") == 0) lsquare--; else if (w.compareTo("{") == 0) lcurly++; else if (w.compareTo("}") == 0) lcurly--; else { w = w.replaceAll("[^-a-z]", " ").trim(); if (w.matches(".*?\\w.*")) { if (lround + lsquare + lcurly > 0) { inbracket = 1; } else { inbracket = 0; } int count = 1; rs = stmt.executeQuery( "select word, count, inbrackets from " + tablename + " where word='" + w + "'"); if (rs.next()) { // normal word exist count += rs.getInt("count"); inbracket *= rs.getInt("inbrackets"); } stmt.execute("delete from " + tablename + " where word ='" + w + "'"); stmt.execute( "insert into " + tablename + " (word, count, inbrackets) values('" + w + "', " + count + "," + inbracket + ")"); } } } listener.progress(5 + i * 45 / total); /*while ((line = reader.readLine()) != null) { line = line.toLowerCase(); line = line.replaceAll("<[^<]+?>", " "); //for xml or html docs line = line.replaceAll(num, " "); line = line.replaceAll("[^-a-z]", " "); line = normalize(line); Statement stmt = conn.createStatement(); ResultSet rs = null; String[] words = line.split("\\s+"); for(int j = 0; j < words.length; j++){ String w = words[j].trim(); if(w.matches(".*?\\w.*")){ int count = 1; rs = stmt.executeQuery("select word, count from "+tablename+" where word='"+w+"'"); if(rs.next()){ count = rs.getInt("count")+1; } stmt.execute("delete from "+tablename+" where word ='"+w+"'"); stmt.execute("insert into "+tablename+" (word, count) values('"+w+"', "+count+")"); } } rs.close(); stmt.close(); }*/ } rs.close(); stmt.close(); } catch (Exception e) { LOGGER.error("Problem in VolumeDehyphenizer:fillInWords", e); e.printStackTrace(); } }