Esempio n. 1
0
  /**
   * Scores documents against a query and returns one {@code Unit} per matching document.
   *
   * <p>Steps, in order:
   * <ol>
   *   <li>Runs {@code init()} once, guarded by the {@code hasInit} flag.</li>
   *   <li>Segments {@code query} with {@code ToAnalysis.parse} and splits each segment's string
   *       form on {@code '/'} into a word and an optional tag.</li>
   *   <li>For every word that {@code Stopwords.isstop} does not reject, fetches its per-document
   *       scores with {@code TFIDF} and passes them through {@code Normal} together with a
   *       weight: 100 when the tag contains {@code "w"} or {@code "nr"}, otherwise 1.</li>
   *   <li>Sums the scores per document id and copies the display fields of each document from
   *       {@code docs}.</li>
   *   <li>Sorts the result with {@code Collections.sort} (ordering is whatever {@code Unit}
   *       defines) and writes {@code "<docId> <score>"} lines to
   *       {@code Path + "TF-IDF_result_x.txt"}.</li>
   * </ol>
   *
   * <p>The query, every segment, the chosen weights, the result size and a final {@code "done"}
   * marker are printed to standard output.
   *
   * @param query the raw query text to segment and score
   * @return the scored documents, sorted by {@code Unit}'s natural ordering
   * @throws Exception if initialisation, scoring, or writing the result file fails
   */
  public static List<Unit> create(String query) throws Exception {
    if (hasInit == 0) {
      init();
      hasInit = 1;
    }

    System.out.println(query);
    List<Term> lis = ToAnalysis.parse(query);

    // One normalised score list per query word that survives stop-word filtering.
    List<List<Unit>> res1 = new ArrayList<List<Unit>>();
    for (int i = 0; i < lis.size(); i++) {
      String tem = lis.get(i).toString();
      System.out.println(tem);
      String[] ary = tem.split("/");
      // String.split drops trailing empty strings, so a segment printed as just "/" yields an
      // empty array: there is no word to score.
      if (ary.length == 0) {
        continue;
      }
      // A segment printed without a "/tag" suffix yields a single element; treat the tag as
      // empty instead of indexing past the end of the array.
      String nature = ary.length > 1 ? ary[1] : "";
      double e = (nature.contains("w") || nature.contains("nr")) ? 100 : 1;
      if (!Stopwords.isstop(ary[0])) {
        String term = ary[0];
        System.out.println(e);
        List<Unit> t1 = TFIDF(term);
        res1.add(Normal(t1, e));
      }
    }

    // Sum the per-word scores for each document id.
    Map<String, Double> res = new HashMap<String, Double>();
    for (List<Unit> termScores : res1) {
      for (Unit scored : termScores) {
        String iDoc = scored.getDocId();
        Double previous = res.get(iDoc);
        if (previous == null) {
          res.put(iDoc, scored.getSrc());
        } else {
          res.put(iDoc, previous + scored.getSrc());
        }
      }
    }

    // Build the result units, copying the display fields from the document table.
    List<Unit> iRes = new ArrayList<Unit>();
    for (Map.Entry<String, Double> entry : res.entrySet()) {
      String key = entry.getKey();
      // NOTE(review): unboxing fails with a NullPointerException if docmap has no entry for
      // this id; presumably every id returned by TFIDF is registered in docmap - confirm.
      int id = docmap.get(key);
      Unit t = new Unit();
      t.setDocId(key);
      t.setChName(docs[id].getChName());
      t.setAddr(docs[id].getAddr());
      t.setURL(docs[id].getURL());
      t.setType(docs[id].getType());
      t.setSrc(entry.getValue());
      iRes.add(t);
    }
    Collections.sort(iRes);
    System.out.println("size = " + iRes.size());

    // Open the result file only once there is something to write, and close it even when a
    // write fails, so an error earlier in the method neither truncates the previous result
    // file nor leaks the file handle.
    File file = new File(Path + "TF-IDF_result_x.txt");
    BufferedWriter writer =
        new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file), "utf-8"));
    try {
      for (Unit unit : iRes) {
        writer.write(unit.getDocId() + " " + unit.getSrc() + "\n");
      }
    } finally {
      writer.close();
    }
    System.out.println("done");

    return iRes;
  }