Example #1
  public static void main(String[] args) throws ParseException, SAXException, TikaException {
    LuceneTester tester;
    try {
      tester = new LuceneTester();
      writeFile = new WriteFile();
      Scanner scanner = new Scanner(System.in);
      // Print this machine's JVM heap size, since indexing may trigger an OutOfMemoryError.
      long heapSize = Runtime.getRuntime().totalMemory() / (1024 * 1024);
      System.out.println("Heap Size : " + heapSize + "MB");

      System.out.print("Index(y/n):");
      String IndexOrNot = scanner.nextLine();
      if (IndexOrNot.compareTo("y") == 0) {
        tester.createIndex();
      }
      scanner.close();
      // Set up the TCP/IP server that receives queries from the C# front-end.
      tcpIpServer = new TcpIpServer();
      tcpIpServer.setup(LuceneConstants.PORT);
      tcpIpServer.accept();
      // 쿼리가 하나 올때마다 해당하는 검색을 수행하는 루프.
      while (true) {
        String input = ""; // Query from the C# front-end
        forSendString = "[Search Result]\r\n"; // Not actually used, but kept for debugging and as a cue that the search finished.
        forSendJson = new JSONObject();

        System.out.print("search query: ");
        input = tcpIpServer.read();
        System.out.println(input);
        /* Search performed for a main-search query ("m" prefix). */
        if (input.startsWith("m")) {
          tester.search(input.substring(1)); // run the main search
        }
        /* Search performed for a sub-search query ("s" prefix). */
        if (input.startsWith("s")) {
          tester.sub_search(input.substring(1)); // run the sub-search
        }
        // Save the search results to out.txt in JSON format; the TCP send just signals completion.
        writeFile.write(forSendJson.toJSONString());
        tcpIpServer.write(forSendString);
      }
    } catch (IOException e) {
      e.printStackTrace();
    }
  }
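The loop above depends on a TcpIpServer helper that is not shown in this example. The following is a minimal sketch of what such a wrapper could look like, assuming a blocking, single-client layer over java.net.ServerSocket; only the method names setup/accept/read/write are taken from the calls above, and everything else is an assumption, not the original class.

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.ServerSocket;
import java.net.Socket;

// Hypothetical sketch: a blocking, single-client TCP wrapper matching the
// setup()/accept()/read()/write() calls used in main(). Not the original class.
public class TcpIpServer {
  private ServerSocket serverSocket;
  private Socket client;
  private BufferedReader in;
  private PrintWriter out;

  // Bind the listening socket to the given port.
  public void setup(int port) throws IOException {
    serverSocket = new ServerSocket(port);
  }

  // Block until the front-end connects, then open text streams.
  public void accept() throws IOException {
    client = serverSocket.accept();
    in = new BufferedReader(new InputStreamReader(client.getInputStream()));
    out = new PrintWriter(client.getOutputStream(), true); // auto-flush on println
  }

  // Read one query line, e.g. "m<query>" for main search or "s<query>" for sub-search.
  public String read() throws IOException {
    return in.readLine();
  }

  // Send the completion cue back to the front-end.
  public void write(String message) {
    out.print(message);
    out.flush();
  }
}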
  /** Sets the output, output file, and comment from the customizer's input fields. */
  public void evaluateInput() {
    writeFile.setOutput(jTextField1.getText());
    writeFile.setOutputFile(jTextField2.getText());
    writeFile.setComment(jTextField3.getText());
  }
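This evaluateInput method presumably belongs to a Swing customizer panel: jTextField1 through jTextField3 would be the panel's input fields and writeFile the bean being configured (inferred from the naming and the javadoc; the surrounding class is not shown in this excerpt).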
  public static void main(String[] args) throws IOException {

    try {
      String type = "rawWords";
      Utility u = new Utility(type);

      String folder = u.returnFolderName();

      Double avg_doc_length = u.returnAvgLength();
      Map<String, DocBean> docCatBean = u.getDocCat();
      Map<String, TokenCatalogBean> tokenCatBean = u.getTokenCat();

      System.out.println("Avg Doc Length:: " + avg_doc_length);
      System.out.println("DocCatBean Size:: " + docCatBean.size());
      System.out.println("Token Cat Bean:: " + tokenCatBean.size());
      System.out.println("Doc Bean:::: " + u.getDocBean().size());

      long vocabSize = tokenCatBean.size();
      System.out.println("Vocab Size ::" + vocabSize);

      System.out.println("Avg Doc Length:::: " + avg_doc_length);

      /* Method to read the query file */

      String query_file_path =
          "C:/Users/Nitin/NEU/Summer Sem/IR/Data/Assign 1/AP89_DATA/AP_DATA/query_desc.51-100.short.txt";
      /* This will later be replaced by a path read from the config file. */
      GetFinalQueries qu = new GetFinalQueries();
      List<String> queries = qu.readQueryFile(query_file_path);

      /*
       * Get the stopwords from the file and append the common words from
       * the query file.
       */

      /* Changed the split regex from space only to space and hyphen. */
      List<String> stop_words_final = qu.getStopWords();

      /*
       * Remove the stopwords from each query, keeping only the final
       * query terms. Each query is a list of strings (the query words).
       */
      List<List<String>> final_query = qu.getFinalQueryList(queries, stop_words_final);

      List<List<String>> resultLaplace = new ArrayList<List<String>>();

      for (List<String> query : final_query) {
        Map<String, String> queryTFList = new HashMap<String, String>();
        Map<String, Double> rankTerm = new HashMap<String, Double>();

        String querynum = null;

        System.out.println("Query Minus stop words");
        System.out.println("=======================");

        /*
         * The first entry of each query is its number; strip the trailing
         * dot. For every remaining word, compute the Laplace-smoothed
         * score and sum it up.
         */
        querynum = query.get(0).replace(".", "");

        System.out.println("Query Size::" + query.size());
        for (int i = 1; i < query.size(); i++) {
          /* Calculate the term frequencies for this query term across all documents. */
          String w = query.get(i).toLowerCase();

          // tfMap: docId -> frequency of this term in that document.
          Map<String, Integer> tfMap =
              laplaceSmoothing(
                  w.replaceAll("[,\"()]", ""),
                  avg_doc_length,
                  tokenCatBean,
                  docCatBean,
                  u,
                  vocabSize,
                  folder);

          System.out.println(
              "Size of TF Results:: " + tfMap.size() + " for: " + w.replaceAll("[,\"()]", ""));

          for (Map.Entry<String, Integer> term : tfMap.entrySet()) {
            // Accumulate per-document TFs as a space-separated string:
            // docId -> "tf1 tf2 ..." (one value per matching query term).
            if (queryTFList.get(term.getKey()) == null) {
              queryTFList.put(term.getKey(), term.getValue().toString());
            } else {
              queryTFList.put(
                  term.getKey(), queryTFList.get(term.getKey()) + " " + term.getValue().toString());
            }
          }
        }

        System.out.println("Final DOc List Size::: " + queryTFList.size());
        System.out.println("Calculating Laplace Smoothing Score for each ::::::: ");
        for (Map.Entry<String, String> d : queryTFList.entrySet()) {

          double docLen = getDocLength(d.getKey(), docCatBean);

          rankTerm.put(
              d.getKey(),
              laplacePerTerm(d.getValue(), docLen, avg_doc_length, vocabSize, query.size()));
        }
        /* Sort the rank map by score value. */
        SortMap sm = new SortMap();

        LinkedHashMap<String, Double> sortedRanks =
            (LinkedHashMap<String, Double>) sm.getSortedRankMap(rankTerm);
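        // getSortedRankMap presumably orders entries by score, highest first
        // (SortMap's implementation is not shown). The loop below then emits
        // the top 1000 documents in TREC run format:
        // "<querynum> Q0 <docId> <rank> <score> EXP".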

        int j = 1;
        List<String> queryResults = new ArrayList<String>();

        for (Entry<String, Double> term : sortedRanks.entrySet()) {

          if (j <= 1000) {

            String toWrite =
                querynum
                    + " "
                    + "Q0"
                    + " "
                    + term.getKey()
                    + " "
                    + j
                    + " "
                    + term.getValue()
                    + " "
                    + "EXP";

            queryResults.add(toWrite);

          } else {
            break;
          }

          j++;
        }

        resultLaplace.add(queryResults);
      }

      WriteFile writer = new WriteFile();
      writer.writeToFile(resultLaplace, "Laplace-1.txt", type);

    } catch (IOException e) {
      e.printStackTrace();
    }
  }
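Neither laplaceSmoothing nor laplacePerTerm appears in this example. As a reference for the scoring step, here is a hypothetical sketch of laplacePerTerm using the standard add-one (Laplace) smoothed language-model score; the parameter list is inferred from the call site above, and the body is an assumption rather than the original implementation.

// Hypothetical sketch of laplacePerTerm (the original body is not shown).
// Assumes the usual add-one smoothed language-model score:
//   score(d) = sum over query terms w of log( (tf(w,d) + 1) / (docLen + V) )
// tfValues holds the space-separated frequencies of the query terms that
// occur in this document; query terms absent from the document contribute
// tf = 0. querySize includes the query-number token at index 0, hence "- 1".
public static double laplacePerTerm(
    String tfValues, double docLen, double avgDocLength, long vocabSize, int querySize) {
  // avgDocLength is accepted to match the call site but unused in this sketch.
  String[] tfs = tfValues.trim().split(" ");
  double score = 0.0;

  // Terms that occur in the document.
  for (String tf : tfs) {
    double freq = Double.parseDouble(tf);
    score += Math.log((freq + 1.0) / (docLen + vocabSize));
  }

  // Query terms absent from the document still get the smoothed zero-count mass.
  int missing = (querySize - 1) - tfs.length;
  for (int k = 0; k < missing; k++) {
    score += Math.log(1.0 / (docLen + vocabSize));
  }
  return score;
}

With add-one smoothing every document that matches at least one query term receives a finite log-probability, which is why the code above can rank everything accumulated in queryTFList.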