/**
 * Builds a {@code Dataset} for the given file name and prepares it for use.
 *
 * <p>After constructing the dataset this calls {@code parsestories()} on it and then
 * {@code Stopwords.genstopwords()}, in that order, before returning the dataset.
 *
 * @param fname the file name handed to the {@code Dataset} constructor
 * @return the newly created dataset
 */
public static Dataset fromfile(String fname) {
    final Dataset dataset = new Dataset(fname);
    dataset.parsestories();
    Stopwords.genstopwords();
    return dataset;
}
public StopWordsRemover( boolean useScientificStopWords, int minWordLength, String lang, boolean local) { this.useScientificStopWords = useScientificStopWords; this.local = local; this.lang = lang; this.minWordLength = minWordLength; this.stopwordsForOneLanguage = new ArrayList(Stopwords.getStopWords(lang, local)); nbStopWordsShort = Math.min(200, Math.max(0, (stopwordsForOneLanguage.size() - 1))); nbStopWords = Math.min(5000, Math.max(0, (stopwordsForOneLanguage.size() - 1))); try { init(); } catch (IOException ex) { Logger.getLogger(StopWordsRemover.class.getName()).log(Level.SEVERE, null, ex); } }
/**
 * Builds the stop-word lookup sets from {@code stopwordsForOneLanguage}.
 *
 * <p>The first {@code nbStopWords} entries of the list feed {@code setStopWords}; the first
 * {@code nbStopWordsShort} entries feed {@code setStopWordsShort} and, through it,
 * {@code setStopWordsScientificOrShort}. When {@code useScientificStopWords} is set, the words
 * returned by {@code Stopwords.getScientificStopwords(local, lang)} are added to both
 * {@code setStopWords} and {@code setStopWordsScientificOrShort}. {@code setKeepWords} is reset
 * to an empty set.
 *
 * @throws IOException declared for callers; presumably raised by the scientific stop-word
 *     lookup - TODO confirm
 */
private void init() throws IOException {
    setKeepWords = new HashSet();
    setStopWords = new HashSet();
    setStopWordsShort = new HashSet();
    setStopWordsScientificOrShort = new HashSet();

    listGeneralStopwordsLarge = stopwordsForOneLanguage.subList(0, nbStopWords);
    listGeneralStopwordsShort = stopwordsForOneLanguage.subList(0, nbStopWordsShort);

    setStopWords.addAll(listGeneralStopwordsLarge);

    // The short set must be filled BEFORE it is copied: addAll takes a snapshot of the source's
    // current elements, so copying first left the combined set without any short stop words.
    setStopWordsShort.addAll(listGeneralStopwordsShort);
    setStopWordsScientificOrShort.addAll(setStopWordsShort);

    if (useScientificStopWords) {
        setStopwordsScientific = Stopwords.getScientificStopwords(local, lang);
        setStopWords.addAll(setStopwordsScientific);
        setStopWordsScientificOrShort.addAll(setStopwordsScientific);
    }
}
public static List<Unit> create(String query) throws Exception { // TODO Auto-generated method stub /* DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); DocumentBuilder db = dbf.newDocumentBuilder(); Document document = db.parse(new File(Path + "req_result.xml")); NodeList list = document.getElementsByTagName("Pro"); */ if (hasInit == 0) { init(); hasInit = 1; } File file = new File(Path + "TF-IDF_result_x.txt"); OutputStreamWriter write = new OutputStreamWriter(new FileOutputStream(file), "utf-8"); BufferedWriter writer = new BufferedWriter(write); System.out.println(query); List<Term> lis = ToAnalysis.parse(query); List<List<Unit>> res1 = new ArrayList<List<Unit>>(); // List<List<Unit>> res2 = new ArrayList<List<Unit>>(); for (int i = 0; i < lis.size(); i++) { String tem = lis.get(i).toString(); System.out.println(tem); String[] ary = tem.split("/"); String term = ""; double e = ((ary[1].contains("w") || ary[1].contains("nr")) ? 100 : 1); if (!Stopwords.isstop(ary[0]) && ary.length > 0) { term = ary[0]; System.out.println(e); List<Unit> t1 = TFIDF(term); res1.add(Normal(t1, e)); // List<Unit> t2 = BM25(term); // res2.add(Normal(t2)); } } Map<String, Double> res = new HashMap<String, Double>(); for (int i = 0; i < res1.size(); i++) { for (int j = 0; j < res1.get(i).size(); j++) { String iDoc = res1.get(i).get(j).getDocId(); if (res.containsKey(iDoc)) { double tem = res.get(iDoc); res.remove(iDoc); res.put(iDoc, tem + res1.get(i).get(j).getSrc()); } else { res.put(iDoc, res1.get(i).get(j).getSrc()); } } } List<Unit> iRes = new ArrayList<Unit>(); for (String key : res.keySet()) { Unit t = new Unit(); int id = docmap.get(key); t.setDocId(key); t.setChName(docs[id].getChName()); t.setAddr(docs[id].getAddr()); t.setURL(docs[id].getURL()); t.setType(docs[id].getType()); t.setSrc(res.get(key)); iRes.add(t); } Collections.sort(iRes); System.out.println("size = " + iRes.size()); for (int i = 0; i < iRes.size(); i++) { writer.write(iRes.get(i).getDocId() + 
" " + iRes.get(i).getSrc() + "\n"); } writer.close(); /* double maxx = 0,minn = 1e11; for(int i = 0;i < res.size();i ++){ //writer.write(res.get(i).getDocId() + " " + res.get(i).getNum() + "\n"); double tem = res.get(i).getNum(); maxx = Math.max(maxx, tem);minn = Math.min(minn, tem); } for(int i = 0;i < res.size();i ++){ double tem = res.get(i).getNum(); res.get(i).setNum(10 * (tem - minn) / (maxx - minn)); writer.write(res.get(i).getDocId() + " " + res.get(i).getNum() + "\n"); } writer.close(); writer.close(); file = new File("TF-IDF_result.txt"); write = new OutputStreamWriter(new FileOutputStream(file),"utf-8"); writer = new BufferedWriter(write); */ System.out.println("done"); return iRes; }