public void run() { int articleID = IDManager.getNextStemArticleID(); char[] w = new char[501]; Stemmer s = new Stemmer(); // NewsTextRetriever nt = new NewsTextRetriever(); while (articleID != 0) { String document = DBConnection.getInstance().getNewsText(articleID); if (document == null) { articleID = IDManager.getNextStemArticleID(); continue; } // replace useless stuff document = document.replace(";", ""); document = document.replace("!", ""); document = document.replace("?", ""); document = document.replace(",", ""); document = document.replace("'", ""); document = document.replace("´", ""); document = document.replace("`", ""); document = document.replace(".", ""); document = document.replace(":", ""); char[] text = document.toCharArray(); Article article = new Article(articleID); // Iterate over all words and stem them. Original algorithm int i = 0; while (i < text.length) { char ch = text[i]; i++; if (Character.isLetter((char) ch)) { int j = 0; while (i < text.length) { { ch = Character.toLowerCase((char) ch); w[j] = (char) ch; if (j < 500) j++; ch = text[i]; i++; if (!Character.isLetter((char) ch) || i == text.length) { if (i == text.length) { w[j] = (char) ch; if (j < 500) j++; } /* to test add(char ch) */ for (int c = 0; c < j; c++) s.add(w[c]); /* or, to test add(char[] w, int j) */ /* s.add(w, j); */ s.stem(); String stem = s.toString(); if (stem.length() < 255) { article.addArticleFeatureInstance(stem); break; } } } } if (ch < 0) break; } } DBConnection.getInstance().storeNewsStems(article); articleID = IDManager.getNextStemArticleID(); } }
/**
 * Creates a {@code PhySimObj} by delegating to the constructor overload that
 * accepts the value returned by {@code IDManager.getNextID()}.
 */
public PhySimObj() { this(IDManager.getNextID()); }