Beispiel #1
0
  /**
   * Creates a {@code Dataset} for the given name, parses its stories, and triggers stop-word
   * generation before returning the dataset.
   *
   * @param fname value handed to the {@code Dataset} constructor
   * @return the dataset after {@code parsestories()} has been invoked on it
   */
  public static Dataset fromfile(String fname) {
    final Dataset dataset = new Dataset(fname);
    dataset.parsestories();
    // Static call; kept after parsing so the original call order is preserved.
    Stopwords.genstopwords();
    return dataset;
  }
Beispiel #2
0
 public StopWordsRemover(
     boolean useScientificStopWords, int minWordLength, String lang, boolean local) {
   this.useScientificStopWords = useScientificStopWords;
   this.local = local;
   this.lang = lang;
   this.minWordLength = minWordLength;
   this.stopwordsForOneLanguage = new ArrayList(Stopwords.getStopWords(lang, local));
   nbStopWordsShort = Math.min(200, Math.max(0, (stopwordsForOneLanguage.size() - 1)));
   nbStopWords = Math.min(5000, Math.max(0, (stopwordsForOneLanguage.size() - 1)));
   try {
     init();
   } catch (IOException ex) {
     Logger.getLogger(StopWordsRemover.class.getName()).log(Level.SEVERE, null, ex);
   }
 }
Beispiel #3
0
  /**
   * Rebuilds the stop-word collections from {@code stopwordsForOneLanguage}.
   *
   * <p>After this call:
   * <ul>
   *   <li>{@code setKeepWords} is a fresh empty set;</li>
   *   <li>{@code setStopWords} holds the first {@code nbStopWords} general stop words;</li>
   *   <li>{@code setStopWordsShort} holds the first {@code nbStopWordsShort} general stop
   *       words;</li>
   *   <li>{@code setStopWordsScientificOrShort} holds the short stop words;</li>
   *   <li>when {@code useScientificStopWords} is set, the scientific stop words are added to both
   *       {@code setStopWords} and {@code setStopWordsScientificOrShort}.</li>
   * </ul>
   *
   * <p>The two {@code list...} fields are {@code subList} views of
   * {@code stopwordsForOneLanguage}, not copies.
   *
   * @throws IOException declared by the original signature; presumably raised by the scientific
   *     stop-word lookup (verify against {@code Stopwords.getScientificStopwords})
   */
  private void init() throws IOException {
    setKeepWords = new HashSet();
    setStopWords = new HashSet();
    setStopWordsScientificOrShort = new HashSet();
    setStopWordsShort = new HashSet();

    listGeneralStopwordsLarge = stopwordsForOneLanguage.subList(0, nbStopWords);
    listGeneralStopwordsShort = stopwordsForOneLanguage.subList(0, nbStopWordsShort);

    setStopWords.addAll(listGeneralStopwordsLarge);

    // The short set must be populated BEFORE it is copied into the combined set; copying first
    // would add an empty set and leave the "short" half of scientific-or-short missing.
    setStopWordsShort.addAll(listGeneralStopwordsShort);
    setStopWordsScientificOrShort.addAll(setStopWordsShort);

    if (useScientificStopWords) {
      setStopwordsScientific = Stopwords.getScientificStopwords(local, lang);
      setStopWords.addAll(setStopwordsScientific);
      setStopWordsScientificOrShort.addAll(setStopwordsScientific);
    }
  }
Beispiel #4
0
  /**
   * Scores documents against {@code query} and returns them sorted.
   *
   * <p>Steps:
   * <ol>
   *   <li>Runs the one-time {@code init()} if {@code hasInit} is still 0.</li>
   *   <li>Parses the query with {@code ToAnalysis.parse} and splits each term's string form on
   *       {@code "/"}: the first piece is used as the word, the second (if present) as a tag.</li>
   *   <li>For every word that is not a stop word, obtains its {@code TFIDF} result list and
   *       passes it through {@code Normal} with a weight of 100 when the tag contains
   *       {@code "w"} or {@code "nr"}, otherwise 1.</li>
   *   <li>Sums the resulting scores per document id, builds one {@code Unit} per document from
   *       {@code docs}, and sorts the list with {@code Collections.sort} (ordering is defined by
   *       {@code Unit}'s natural order).</li>
   *   <li>Writes one {@code "<docId> <score>"} line per result to
   *       {@code Path + "TF-IDF_result_x.txt"} in UTF-8.</li>
   * </ol>
   *
   * <p>The query, each parsed term, each applied weight, the result count and {@code "done"} are
   * printed to standard output.
   *
   * <p>The output file is opened only once the results are ready and is always closed, so a
   * failure while scoring neither truncates the previous file nor leaks a file handle.
   *
   * @param query the raw query text to analyse
   * @return the scored documents in sorted order; empty if no term produced a score
   * @throws Exception if initialisation, scoring, or writing the result file fails
   */
  public static List<Unit> create(String query) throws Exception {
    if (hasInit == 0) {
      init();
      hasInit = 1;
    }

    System.out.println(query);
    List<Term> lis = ToAnalysis.parse(query);

    // One normalised score list per non-stop-word query term.
    List<List<Unit>> perTermScores = new ArrayList<List<Unit>>();
    for (Term parsed : lis) {
      String tem = parsed.toString();
      System.out.println(tem);
      String[] ary = tem.split("/");
      // split() drops trailing empty strings, so a term rendered as just "/" yields no pieces.
      if (ary.length == 0) {
        continue;
      }
      String term = ary[0];
      if (Stopwords.isstop(term)) {
        continue;
      }
      // A term without a tag piece gets the default weight instead of failing on ary[1].
      double e = 1;
      if (ary.length > 1 && (ary[1].contains("w") || ary[1].contains("nr"))) {
        e = 100;
      }
      System.out.println(e);
      List<Unit> t1 = TFIDF(term);
      perTermScores.add(Normal(t1, e));
    }

    // Accumulate the per-term scores into a single total per document id.
    Map<String, Double> res = new HashMap<String, Double>();
    for (List<Unit> termScores : perTermScores) {
      for (Unit scored : termScores) {
        String docId = scored.getDocId();
        Double soFar = res.get(docId);
        if (soFar == null) {
          res.put(docId, scored.getSrc());
        } else {
          res.put(docId, soFar + scored.getSrc());
        }
      }
    }

    List<Unit> iRes = new ArrayList<Unit>();
    for (Map.Entry<String, Double> entry : res.entrySet()) {
      String key = entry.getKey();
      Unit t = new Unit();
      int id = docmap.get(key);
      t.setDocId(key);
      t.setChName(docs[id].getChName());
      t.setAddr(docs[id].getAddr());
      t.setURL(docs[id].getURL());
      t.setType(docs[id].getType());
      t.setSrc(entry.getValue());
      iRes.add(t);
    }
    Collections.sort(iRes);
    System.out.println("size = " + iRes.size());

    File file = new File(Path + "TF-IDF_result_x.txt");
    BufferedWriter writer =
        new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file), "utf-8"));
    try {
      for (Unit ranked : iRes) {
        writer.write(ranked.getDocId() + " " + ranked.getSrc() + "\n");
      }
    } finally {
      // Close on every path so the file handle is released even if a write fails.
      writer.close();
    }

    System.out.println("done");

    return iRes;
  }