Esempio n. 1
0
  public void createSenIndex(JSONArray jsonArray, String indexPath, String stopwordsFile)
      throws Exception {

    if (jsonArray == null) {
      System.out.println("error: jsonArray is null!\n");
      return;
    }

    Analyzer analyzer = null;
    if (stopwordsFile == null) {
      analyzer = new SimpleAnalyzer();
    } else {
      analyzer = new StopAnalyzer(Paths.get(stopwordsFile));
    }

    IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    IndexWriter indexWriter = new IndexWriter(FSDirectory.open(Paths.get(indexPath)), iwc);
    indexWriter.deleteAll();

    TextUtil textUtil = new TextUtil();

    long startTime = new Date().getTime();

    System.out.println("jsonArray size: " + jsonArray.size());

    long num_sentence = 0;

    for (JSONObject jsonObj : (List<JSONObject>) jsonArray) {
      long id = (long) jsonObj.get(idKey);
      String review = (String) jsonObj.get(reviewKey);

      if (review == null || review.isEmpty()) {
        continue;
      }

      if (review.matches(".*[^\\x00-\\x7F].*")) {
        continue;
      }

      String[] tokens = textUtil.tokenize(review);
      if (tokens.length <= TERM_MIN_THRESHOLD) {
        continue;
      }

      String[] sentences = textUtil.sentenceDetect(review);
      // System.out.println(body.toLowerCase() + "\n");
      num_sentence = 0;

      for (int i = 0; i < sentences.length; i++) {
        if (sentences[i] == null || sentences[i].isEmpty()) {
          continue;
        }
        // System.out.println(sentences[i]);
        Document doc = new Document();
        Field idField = new LongField(idKey, id, Field.Store.YES);
        Field numField = new LongField("num", num_sentence, Field.Store.NO);
        Field contentField =
            new TextField(
                reviewKey,
                sentences[i].replaceAll("[_'.,]", " ").replaceAll("[0-9]", ""),
                Field.Store.YES);

        doc.add(idField);
        doc.add(numField);
        doc.add(contentField);

        indexWriter.addDocument(doc);
        num_sentence++;
      }
    }

    indexWriter.commit();
    indexWriter.close();

    long endTime = new Date().getTime();
    System.out.println("\n\ncreate index time: " + (endTime - startTime) + "ms");
    System.out.println("\n sentence num: " + num_sentence + "\n");
  }