Esempio n. 1
0
  /**
   * Builds an in-memory Lucene index from the file tree rooted at {@code processingPath}.
   *
   * <p>An {@code IOException} raised while opening the writer, while indexing, or while closing
   * the writer is printed to standard error and not rethrown, so the returned directory may be
   * empty or only partially populated.
   *
   * @param analyzer analyzer handed to the {@code IndexWriterConfig}
   * @param processingPath path of the file or directory passed on to {@code index_h}
   * @return the {@code RAMDirectory} the writer was opened on
   */
  private static Directory index(Analyzer analyzer, String processingPath) {
    RAMDirectory ramDirectory = new RAMDirectory();
    IndexWriter writer = null;
    try {
      writer = new IndexWriter(ramDirectory, new IndexWriterConfig(Version.LUCENE_35, analyzer));
      index_h("", new File(processingPath), writer);
    } catch (IOException e) {
      e.printStackTrace();
    } finally {
      if (writer != null) {
        try {
          writer.close();
        } catch (IOException closeFailure) {
          // CorruptIndexException is an IOException subtype, so one handler covers both.
          closeFailure.printStackTrace();
        }
      }
    }
    return ramDirectory;
  }
Esempio n. 2
0
  /**
   * Rebuilds the index under {@code indexDir} from the given JSON payload.
   *
   * <p>All existing documents are deleted, then one document is added for every string in the
   * {@code "statusData"} array followed by every string in the {@code "userData"} array. I/O and
   * JSON errors are printed to standard error and not rethrown.
   *
   * @param indexData JSON object holding the {@code "statusData"} and {@code "userData"} arrays
   */
  public void buildIndex(JSONObject indexData) {
    Directory dir = null;
    try {
      dir = FSDirectory.open(new File(indexDir));
      IKAnalyzer analyzer = new IKAnalyzer();
      analyzer.setUseSmart(true);
      IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_35, analyzer);
      indexWriter = new IndexWriter(dir, iwc);
      indexWriter.deleteAll();

      addTextDocuments(indexData.getJSONArray("statusData"));
      addTextDocuments(indexData.getJSONArray("userData"));

      // No explicit commit here: the close() in the finally block is what persists the changes.
      System.out.println("Index is done");
    } catch (IOException e) {
      e.printStackTrace();
    } catch (JSONException e) {
      e.printStackTrace();
    } finally {
      // The writer is still null when opening the directory or the writer itself failed;
      // closing it unguarded would raise a NullPointerException that hides the real error.
      if (indexWriter != null) {
        try {
          indexWriter.close();
        } catch (IOException e) {
          // CorruptIndexException is an IOException subtype, so one handler covers both.
          e.printStackTrace();
        }
      }
      // The writer does not close the directory it was opened on, so release it here.
      if (dir != null) {
        try {
          dir.close();
        } catch (IOException e) {
          e.printStackTrace();
        }
      }
    }
  }

  /** Adds one document with a stored, analyzed {@code "text"} field per string in {@code texts}. */
  private void addTextDocuments(JSONArray texts) throws IOException, JSONException {
    for (int i = 0; i < texts.length(); i++) {
      String text = texts.getString(i);
      Document doc = new Document();
      doc.add(
          new Field(
              "text",
              text,
              Field.Store.YES,
              Field.Index.ANALYZED,
              Field.TermVector.WITH_POSITIONS_OFFSETS));
      indexWriter.addDocument(doc);
    }
  }
 /**
  * {@inheritDoc}
  *
  * <p>Opens a writer on the directory named by {@code m_luceneDirectory} and deletes the documents
  * whose {@code LUCENE_ID} term equals the page name. Any failure is logged and not propagated;
  * the writer is always handed to {@code close(IndexWriter)} afterwards.
  */
 public void pageRemoved(WikiPage page) {
   IndexWriter indexWriter = null;
   try {
     File indexLocation = new File(m_luceneDirectory);
     indexWriter = getIndexWriter(new SimpleFSDirectory(indexLocation, null));

     Term pageIdTerm = new Term(LUCENE_ID, page.getName());
     Query matchPage = new TermQuery(pageIdTerm);
     indexWriter.deleteDocuments(matchPage);
   } catch (Exception e) {
     log.error("Unable to remove page '" + page.getName() + "' from Lucene index", e);
   } finally {
     close(indexWriter);
   }
 }
Esempio n. 4
0
  /**
   * Recursively indexes {@code file} with the given writer.
   *
   * <p>A directory contributes no document of its own; each of its entries is indexed with the
   * directory's name appended to {@code prefix}. A regular file is read as UTF-8 and added as one
   * document with a stored, analyzed {@code content} field and a stored, non-analyzed
   * {@code relative_path} field ({@code prefix + FILE_SEPARATOR + name}).
   *
   * @param prefix relative path of the parent of {@code file}; empty for the root
   * @param file file or directory to index
   * @param indexWriter writer that receives the documents
   * @throws IOException if a directory cannot be listed, a file cannot be read, or the writer fails
   */
  private static void index_h(String prefix, File file, IndexWriter indexWriter)
      throws IOException {
    if (file.isDirectory()) {
      File[] children = file.listFiles();
      // listFiles() returns null on an I/O error or missing permission; report that as an
      // IOException instead of letting the loop below fail with a NullPointerException.
      if (children == null) {
        throw new IOException("Unable to list directory: " + file.getAbsolutePath());
      }
      String childPrefix = prefix + FILE_SEPARATOR + file.getName();
      for (File child : children) {
        index_h(childPrefix, child, indexWriter);
      }
      return;
    }

    String content = FileUtils.readFileToString(file, "utf-8");

    System.out.println("==============================================================");
    System.out.println("index_h " + content);
    System.out.println("==============================================================");

    String filename = prefix + FILE_SEPARATOR + file.getName();

    Document doc = new Document();
    doc.add(new Field("content", content, Field.Store.YES, Field.Index.ANALYZED));
    doc.add(new Field("relative_path", filename, Field.Store.YES, Field.Index.NOT_ANALYZED));
    indexWriter.addDocument(doc);
  }
 /**
  * Closes the given writer via {@code close(true)}, logging rather than propagating any
  * {@code IOException}. A {@code null} writer is ignored.
  *
  * @param writer writer to close; may be {@code null}
  */
 void close(IndexWriter writer) {
   if (writer == null) {
     return;
   }
   try {
     writer.close(true);
   } catch (IOException e) {
     log.error(e);
   }
 }
Esempio n. 6
0
 public NewIndexer(String indexDir2) throws IOException {
   // create the index
   if (indexWriter2 == null) {
     indexWriter2 =
         new IndexWriter(
             FSDirectory.open(new File("resultsList")),
             new IndexWriterConfig(
                 Version.LUCENE_36,
                 new EnglishAnalyzer(Version.LUCENE_36, StandardAnalyzer.STOP_WORDS_SET)));
     indexWriter2.deleteAll();
   }
 }
Esempio n. 7
0
 /**
  * Adds one document for {@code newIndexItem} to {@code indexWriter2}.
  *
  * <p>The document carries six stored, analyzed fields: query number, term id, label, search
  * label, query term and number of results, each taken from the item's getter via
  * {@code toString()}. An existing document for the same item is not deleted first.
  *
  * @param newIndexItem item whose values are indexed
  * @throws IOException if the writer fails to add the document
  */
 public void newIndex(NewIndexItem newIndexItem) throws IOException {
   final Field.Store stored = Field.Store.YES;
   final Field.Index analyzed = Field.Index.ANALYZED;
   Document entry = new Document();

   String queryNumber = newIndexItem.getNewQueryNumber().toString();
   entry.add(new Field(NewIndexItem.QUERYNUMBER, queryNumber, stored, analyzed));

   String termId = newIndexItem.getNewTermID().toString();
   entry.add(new Field(NewIndexItem.TERMID, termId, stored, analyzed));

   String label = newIndexItem.getNewLabel().toString();
   entry.add(new Field(NewIndexItem.LABEL, label, stored, analyzed));

   String searchLabel = newIndexItem.getNewSearchLabel().toString();
   entry.add(new Field(NewIndexItem.SEARCHLABEL, searchLabel, stored, analyzed));

   String queryTerm = newIndexItem.getNewQueryTerm().toString();
   entry.add(new Field(NewIndexItem.QUERYTERM, queryTerm, stored, analyzed));

   String numOfResults = newIndexItem.getNewNumOfResults().toString();
   entry.add(new Field(NewIndexItem.NUMOFRESULTS, numOfResults, stored, analyzed));

   indexWriter2.addDocument(entry);
 }
Esempio n. 8
0
 /**
  * Closes {@code indexWriter2}.
  *
  * @throws IOException if the writer's {@code close()} call fails
  */
 public void close() throws IOException {
   indexWriter2.close();
 }
Esempio n. 9
0
  /**
   * Rebuilds the sentence-level index at {@code indexPath} from the reviews in {@code jsonArray}.
   *
   * <p>For each JSON object the value under {@code idKey} is read as the review id and the value
   * under {@code reviewKey} as the review text. A review is skipped when its text is null or
   * empty, contains any non-ASCII character, or tokenizes to {@code TERM_MIN_THRESHOLD} tokens or
   * fewer. Every non-empty sentence of a remaining review becomes one document holding the stored
   * review id, the sentence's zero-based position within its review ({@code "num"}, not stored)
   * and the stored sentence text with {@code _ ' . ,} replaced by spaces and digits removed.
   *
   * <p>The writer is committed and closed on success. If anything fails, the pending changes are
   * rolled back (which also closes the writer) and the exception is rethrown; the directory is
   * closed in both cases. The summary printed at the end reports the total number of sentences
   * indexed across all reviews.
   *
   * @param jsonArray reviews to index; when {@code null} an error line is printed and nothing else
   *     happens
   * @param indexPath file-system path of the index directory
   * @param stopwordsFile path of a stop-word file for a {@code StopAnalyzer}, or {@code null} to
   *     use a {@code SimpleAnalyzer}
   * @throws Exception if reading, tokenizing or indexing fails
   */
  public void createSenIndex(JSONArray jsonArray, String indexPath, String stopwordsFile)
      throws Exception {

    if (jsonArray == null) {
      System.out.println("error: jsonArray is null!\n");
      return;
    }

    Analyzer analyzer;
    if (stopwordsFile == null) {
      analyzer = new SimpleAnalyzer();
    } else {
      analyzer = new StopAnalyzer(Paths.get(stopwordsFile));
    }

    IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);

    try (FSDirectory directory = FSDirectory.open(Paths.get(indexPath))) {
      IndexWriter indexWriter = new IndexWriter(directory, iwc);
      long startTime;
      long totalSentences = 0;

      try {
        indexWriter.deleteAll();

        TextUtil textUtil = new TextUtil();
        startTime = System.currentTimeMillis();

        System.out.println("jsonArray size: " + jsonArray.size());

        // Unchecked: the array is expected to hold JSONObject elements only.
        for (JSONObject jsonObj : (List<JSONObject>) jsonArray) {
          long id = (long) jsonObj.get(idKey);
          String review = (String) jsonObj.get(reviewKey);

          if (review == null || review.isEmpty()) {
            continue;
          }

          // A char scan is used instead of String.matches(".*[^\\x00-\\x7F].*"): '.' does not
          // match line terminators, so that regex let multi-line non-ASCII reviews through.
          if (containsNonAscii(review)) {
            continue;
          }

          String[] tokens = textUtil.tokenize(review);
          if (tokens.length <= TERM_MIN_THRESHOLD) {
            continue;
          }

          String[] sentences = textUtil.sentenceDetect(review);
          // Position of the sentence inside this review; restarts at 0 for every review.
          long sentenceIndex = 0;

          for (String sentence : sentences) {
            if (sentence == null || sentence.isEmpty()) {
              continue;
            }

            Document doc = new Document();
            doc.add(new LongField(idKey, id, Field.Store.YES));
            doc.add(new LongField("num", sentenceIndex, Field.Store.NO));
            doc.add(
                new TextField(
                    reviewKey,
                    sentence.replaceAll("[_'.,]", " ").replaceAll("[0-9]", ""),
                    Field.Store.YES));

            indexWriter.addDocument(doc);
            sentenceIndex++;
            totalSentences++;
          }
        }

        indexWriter.commit();
      } catch (Exception e) {
        // Discard the partial index and release the write lock before propagating the failure.
        try {
          indexWriter.rollback();
        } catch (IOException rollbackFailure) {
          e.addSuppressed(rollbackFailure);
        }
        throw e;
      }
      indexWriter.close();

      long endTime = System.currentTimeMillis();
      System.out.println("\n\ncreate index time: " + (endTime - startTime) + "ms");
      System.out.println("\n sentence num: " + totalSentences + "\n");
    }
  }

  /** Returns true when {@code text} contains any character outside the 7-bit ASCII range. */
  private static boolean containsNonAscii(String text) {
    for (int i = 0; i < text.length(); i++) {
      if (text.charAt(i) > 0x7F) {
        return true;
      }
    }
    return false;
  }
Esempio n. 10
0
  /**
   * Builds the Lucene document for a wiki page and adds it to the given writer.
   *
   * <p>When {@code text} is {@code null} an empty document is returned and nothing is written.
   * Otherwise the document receives the raw page name as its id, the page text, a searchable
   * page-name field (beautified name plus the name with punctuation replaced), the author when
   * one is set, and a {@code ;}-terminated list of attachment names. If the attachments cannot be
   * listed the error is logged and the document is still added without that field.
   *
   * @param page WikiPage being indexed
   * @param text page text to index; may be {@code null}
   * @param writer Lucene IndexWriter that receives the document
   * @return the document that was built
   * @throws IOException if the writer fails to add the document
   */
  protected Document luceneIndexPage(WikiPage page, String text, IndexWriter writer)
      throws IOException {
    if (log.isDebugEnabled()) {
      log.debug("Indexing " + page.getName() + "...");
    }

    Document doc = new Document();
    if (text == null) {
      return doc;
    }

    // The raw name is the key later used to find this document for updates and removals.
    doc.add(new Field(LUCENE_ID, page.getName(), Field.Store.YES, Field.Index.NOT_ANALYZED));

    // The body is stored as well as indexed so search results can show context.
    doc.add(
        new Field(
            LUCENE_PAGE_CONTENTS, text, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO));

    // Page-name search covers both the beautified name and the raw name without punctuation.
    String unTokenizedTitle =
        StringUtils.replaceChars(
            page.getName(), MarkupParser.PUNCTUATION_CHARS_ALLOWED, c_punctuationSpaces);
    String searchableName = TextUtil.beautifyString(page.getName()) + " " + unTokenizedTitle;
    doc.add(
        new Field(
            LUCENE_PAGE_NAME,
            searchableName,
            Field.Store.YES,
            Field.Index.ANALYZED,
            Field.TermVector.NO));

    // Author search, only when the page records one.
    if (page.getAuthor() != null) {
      doc.add(
          new Field(
              LUCENE_AUTHOR,
              page.getAuthor(),
              Field.Store.YES,
              Field.Index.ANALYZED,
              Field.TermVector.NO));
    }

    // Names of the page's attachments, each followed by ';'.
    try {
      Collection<?> attachments = m_engine.getAttachmentManager().listAttachments(page);
      StringBuilder attachmentNames = new StringBuilder();
      for (Object item : attachments) {
        Attachment att = (Attachment) item;
        attachmentNames.append(att.getName()).append(";");
      }
      doc.add(
          new Field(
              LUCENE_ATTACHMENTS,
              attachmentNames.toString(),
              Field.Store.YES,
              Field.Index.ANALYZED,
              Field.TermVector.NO));
    } catch (ProviderException e) {
      // Unable to read attachments
      log.error("Failed to get attachments for page", e);
    }

    writer.addDocument(doc);
    return doc;
  }