/** Pushes random strings through PatternTokenizer, once per group setting (-1, then 0). */
  @Test
  public void testRandomStrings() throws Exception {
    // Same pattern ("a") both times; only the capture-group argument differs.
    for (final int group : new int[] {-1, 0}) {
      Analyzer analyzer =
          new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
              Tokenizer source =
                  new PatternTokenizer(newAttributeFactory(), Pattern.compile("a"), group);
              return new TokenStreamComponents(source);
            }
          };
      checkRandomData(random(), analyzer, 1000 * RANDOM_MULTIPLIER);
      analyzer.close();
    }
  }
  /**
   * Consistency check for a fixed chain: MockCharFilter + MappingCharFilter in front of a
   * MockTokenizer followed by CommonGramsFilter, run on one fixed input string.
   */
  public void test() throws Exception {
    // words handed to CommonGramsFilter
    final CharArraySet commonWords = new CharArraySet(3, false);
    commonWords.add("jjp");
    commonWords.add("wlmwoknt");
    commonWords.add("tcgyreo");

    // character-level replacements applied before tokenization
    final NormalizeCharMap.Builder mappings = new NormalizeCharMap.Builder();
    mappings.add("mtqlpi", "");
    mappings.add("mwoknt", "jjp");
    mappings.add("tcgyreo", "zpfpajyws");
    final NormalizeCharMap charMap = mappings.build();

    Analyzer analyzer =
        new Analyzer() {
          @Override
          protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new MockTokenizer(MockTokenFilter.ENGLISH_STOPSET, false, -65);
            TokenFilter grams = new CommonGramsFilter(source, commonWords);
            return new TokenStreamComponents(source, grams);
          }

          @Override
          protected Reader initReader(String fieldName, Reader reader) {
            Reader mocked = new MockCharFilter(reader, 0);
            Reader mapped = new MappingCharFilter(charMap, mocked);
            return new TestRandomChains.CheckThatYouDidntReadAnythingReaderWrapper(mapped);
          }
        };
    checkAnalysisConsistency(random(), analyzer, false, "wmgddzunizdomqyj");
    analyzer.close();
  }
  // LUCENE-5725
  /**
   * Passes several readers for the same field to MoreLikeThis and expects exactly two clauses,
   * each a TermQuery on "text" for either "lucene" or "apache".
   */
  public void testMultiValues() throws Exception {
    Analyzer keywordAnalyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);
    MoreLikeThis mlt = new MoreLikeThis(reader);
    mlt.setAnalyzer(keywordAnalyzer);
    mlt.setMinDocFreq(1);
    mlt.setMinTermFreq(1);
    mlt.setMinWordLen(1);
    mlt.setFieldNames(new String[] {"text"});

    BooleanQuery likeQuery =
        (BooleanQuery)
            mlt.like(
                "text",
                new StringReader("lucene"),
                new StringReader("lucene release"),
                new StringReader("apache"),
                new StringReader("apache lucene"));
    Collection<BooleanClause> clauses = likeQuery.clauses();
    assertEquals("Expected 2 clauses only!", 2, clauses.size());

    // the only terms allowed to appear in the generated query
    Collection<Term> allowed =
        Arrays.asList(new Term("text", "lucene"), new Term("text", "apache"));
    for (BooleanClause clause : clauses) {
      TermQuery termQuery = (TermQuery) clause.getQuery();
      assertTrue(allowed.contains(termQuery.getTerm()));
    }
    keywordAnalyzer.close();
  }
 /** Checks that a term placed in the exclusion set comes out of CatalanAnalyzer unchanged. */
 public void testExclude() throws IOException {
   CharArraySet excluded = new CharArraySet(asSet("llengües"), false);
   Analyzer analyzer = new CatalanAnalyzer(CatalanAnalyzer.getDefaultStopSet(), excluded);
   // the excluded term is emitted verbatim
   checkOneTerm(analyzer, "llengües", "llengües");
   // a term outside the set is still reduced
   checkOneTerm(analyzer, "llengua", "llengu");
   analyzer.close();
 }
  /**
   * Shuts this component down once: closes the worker, stops the index manager and timing
   * source, releases services, closes analyzers and document builders, and unregisters the
   * statistics MBean when one was registered. Later calls are no-ops.
   */
  @Override
  public void close() {
    // only the first caller to flip the flag performs the shutdown
    if (!stopped.compareAndSet(false, true)) {
      return;
    }

    try {
      worker.close();
    } catch (Exception e) {
      // a failing worker is logged and must not prevent the remaining cleanup
      log.workerException(e);
    }

    this.allIndexesManager.stop();
    this.timingSource.stop();

    serviceManager.releaseAllServices();

    for (Analyzer registeredAnalyzer : this.analyzers.values()) {
      registeredAnalyzer.close();
    }
    for (AbstractDocumentBuilder containedBuilder :
        this.documentBuildersContainedEntities.values()) {
      containedBuilder.close();
    }
    for (EntityIndexBinding binding : this.indexBindingForEntities.values()) {
      binding.getDocumentBuilder().close();
    }

    // unregister statistic mbean
    if (statisticsMBeanName != null) {
      JMXRegistrar.unRegisterMBean(statisticsMBeanName);
    }
  }
 /** Checks that a term placed in the exclusion set comes out of SpanishAnalyzer unchanged. */
 public void testExclude() throws IOException {
   CharArraySet excluded = new CharArraySet(asSet("chicano"), false);
   Analyzer analyzer = new SpanishAnalyzer(SpanishAnalyzer.getDefaultStopSet(), excluded);
   // a term outside the set is still reduced
   checkOneTerm(analyzer, "chicana", "chican");
   // the excluded term is emitted verbatim
   checkOneTerm(analyzer, "chicano", "chicano");
   analyzer.close();
 }
 /**
  * Consistency check for WikipediaTokenizer followed by WordDelimiterFilter (wrapped in
  * SopTokenFilter on both sides), using a fixed character-type table, protected-word set and
  * input string.
  */
 public void testCuriousWikipediaString() throws Exception {
   final CharArraySet protectedWords =
       new CharArraySet(
           new HashSet<>(
               Arrays.asList("rrdpafa", "pupmmlu", "xlq", "dyy", "zqrxrrck", "o", "hsrlfvcha")),
           false);
   // fixed table passed verbatim to WordDelimiterFilter
   final byte[] charTypeTable = {
     -57, 26, 1, 48, 63, -23, 55, -84, 18, 120, -97, 103, 58, 13, 84, 89, 57, -13, -63, 5, 28,
     97, -54, -94, 102, -108, -5, 5, 46, 40, 43, 78, 43, -72, 36, 29, 124, -106, -22, -51, 65,
     5, 31, -42, 6, -99, 97, 14, 81, -128, 74, 100, 54, -55, -25, 53, -71, -98, 44, 33, 86,
     106, -42, 47, 115, -89, -18, -26, 22, -95, -43, 83, -125, 105, -104, -24, 106, -16, 126,
     115, -105, 97, 65, -33, 57, 44, -1, 123, -68, 100, 13, -41, -64, -119, 0, 92, 94, -36, 53,
     -9, -102, -18, 90, 94, -26, 31, 71, -20
   };
   Analyzer analyzer =
       new Analyzer() {
         @Override
         protected TokenStreamComponents createComponents(String fieldName) {
           Tokenizer source = new WikipediaTokenizer();
           TokenStream beforeDelimiter = new SopTokenFilter(source);
           TokenStream delimited =
               new WordDelimiterFilter(beforeDelimiter, charTypeTable, -50, protectedWords);
           TokenStream afterDelimiter = new SopTokenFilter(delimited);
           return new TokenStreamComponents(source, afterDelimiter);
         }
       };
   checkAnalysisConsistency(random(), analyzer, false, "B\u28c3\ue0f8[ \ud800\udfc2 </p> jb");
   analyzer.close();
 }
 /** A word listed in the exclusion set must come out of SoraniAnalyzer unchanged. */
 public void testWithStemExclusionSet() throws IOException {
   CharArraySet excluded = new CharArraySet(1, true);
   excluded.add("پیاوە");
   // empty stop set, so only the exclusion set influences the result
   Analyzer analyzer = new SoraniAnalyzer(CharArraySet.EMPTY_SET, excluded);
   String[] expected = {"پیاوە"};
   assertAnalyzesTo(analyzer, "پیاوە", expected);
   analyzer.close();
 }
 /** Checks the tokens the default CatalanAnalyzer emits for a phrase with elided articles. */
 public void testContractions() throws IOException {
   Analyzer analyzer = new CatalanAnalyzer();
   String phrase = "Diccionari de l'Institut d'Estudis Catalans";
   String[] expected = {"diccion", "inst", "estud", "catalan"};
   assertAnalyzesTo(analyzer, phrase, expected);
   analyzer.close();
 }
  /**
   * Tokenizes the given texts and stores the surviving words in {@code scrubbedWords}.
   *
   * <p>The texts are joined with single spaces (whitespace runs collapsed to one space),
   * tokenized with a {@code StandardAnalyzer}, passed through a {@code LowerCaseFilter}, and
   * then filtered: a word is kept only when it is at least {@code MINWORDLEN} characters long
   * and is not contained in {@code stopwords}. Rejected words are logged at debug level.
   *
   * <p>{@code scrubbedWords} is replaced only after analysis has completed successfully.
   *
   * @param inboundTexts raw texts to tokenize
   * @throws RuntimeException wrapping any exception raised while joining or analyzing
   */
  public void cleanText(String... inboundTexts) {
    try {
      String joinedFields = Joiner.on(" ").join(inboundTexts).replaceAll("\\s+", " ");
      logger.debug("{}", joinedFields);

      ArrayList<String> kept = new ArrayList<String>();
      // try-with-resources closes the stream first, then the analyzer, even on failure
      try (Analyzer analyzer = new StandardAnalyzer();
          TokenStream ts =
              new LowerCaseFilter(
                  analyzer.tokenStream("content", new StringReader(joinedFields)))) {
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        // reset the outermost stream so the whole chain is initialised before consumption
        ts.reset();
        while (ts.incrementToken()) {
          String word = termAtt.toString();
          if (word.length() >= MINWORDLEN && !stopwords.contains(word)) {
            kept.add(word);
          } else {
            logger.debug("Ignoring word: {}", word);
          }
        }
        ts.end();
      }
      scrubbedWords = kept;
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  }
 /** Default CatalanAnalyzer: two inflections reduce to the same term and "un" is dropped. */
 public void testBasics() throws IOException {
   Analyzer analyzer = new CatalanAnalyzer();
   // both forms reduce to the same term
   for (String form : new String[] {"llengües", "llengua"}) {
     checkOneTerm(analyzer, form, "llengu");
   }
   // "un" yields no tokens at all
   String[] noTokens = {};
   assertAnalyzesTo(analyzer, "un", noTokens);
   analyzer.close();
 }
 /** Random-data check for a UIMABaseAnalyzer built from the aggregate sentence descriptor. */
 @Test
 @AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-3869")
 public void testRandomStrings() throws Exception {
   String descriptorPath = "/uima/TestAggregateSentenceAE.xml";
   String tokenType = "org.apache.lucene.uima.ts.TokenAnnotation";
   // no configuration parameters for this variant
   Analyzer uimaAnalyzer = new UIMABaseAnalyzer(descriptorPath, tokenType, null);
   checkRandomData(random(), uimaAnalyzer, 100 * RANDOM_MULTIPLIER);
   uimaAnalyzer.close();
 }
 /** Default SpanishAnalyzer: two inflections reduce to the same term and "los" is dropped. */
 public void testBasics() throws IOException {
   Analyzer analyzer = new SpanishAnalyzer();
   // both forms reduce to the same term
   for (String form : new String[] {"chicana", "chicano"}) {
     checkOneTerm(analyzer, form, "chican");
   }
   // "los" yields no tokens at all
   String[] noTokens = {};
   assertAnalyzesTo(analyzer, "los", noTokens);
   analyzer.close();
 }
 /** Closes the shared reader, directory and analyzer, then clears every static fixture field. */
 @AfterClass
 public static void afterClass() throws Exception {
   // release resources in the same order as before: reader, directory, analyzer
   reader.close();
   directory.close();
   analyzer.close();

   // drop the static references so nothing outlives the test class
   s2 = null;
   s1 = null;
   reader = null;
   directory = null;
   analyzer = null;
 }
 /**
  * Entry point: connects to the Solr server at {@code DEFAULT_URL} and processes
  * {@code filename} through {@code readFileByLines} with a StandardAnalyzer and a
  * fixed-seed random generator.
  *
  * <p>The analyzer is closed and the server client is shut down even when processing fails.
  *
  * @param args ignored
  */
 public static void main(String[] args) {
   server = new HttpSolrServer(DEFAULT_URL);
   try {
     // fixed seed, so every run draws the same random sequence
     Random random = new Random(100);
     Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
     try {
       readFileByLines(filename, analyzer, random);
     } finally {
       analyzer.close();
     }
   } finally {
     // release the HTTP client's connections instead of relying on garbage collection
     server.shutdown();
     server = null;
   }
   // kept from the original; no longer needed for releasing the client
   System.runFinalization();
   System.gc();
 }
 // LUCENE-3326
 /**
  * Calls like() for field "foobar" while two field names are configured. The test makes no
  * assertions, so it passes when the call completes without throwing.
  */
 public void testMultiFields() throws Exception {
   Analyzer whitespaceAnalyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
   MoreLikeThis mlt = new MoreLikeThis(reader);
   mlt.setAnalyzer(whitespaceAnalyzer);
   mlt.setMinDocFreq(1);
   mlt.setMinTermFreq(1);
   mlt.setMinWordLen(1);
   String[] fieldNames = {"text", "foobar"};
   mlt.setFieldNames(fieldNames);
   StringReader likeText = new StringReader("this is a test");
   mlt.like("foobar", likeText);
   whitespaceAnalyzer.close();
 }
 /** Random-data check for a UIMABaseAnalyzer configured with a custom "line-end" parameter. */
 @Test
 @AwaitsFix(bugUrl = "https://issues.apache.org/jira/browse/LUCENE-3869")
 public void testRandomStringsWithConfigurationParameters() throws Exception {
   Map<String, Object> configuration = new HashMap<>();
   configuration.put("line-end", "\r");

   String descriptorPath = "/uima/TestWSTokenizerAE.xml";
   String tokenType = "org.apache.lucene.uima.ts.TokenAnnotation";
   Analyzer uimaAnalyzer = new UIMABaseAnalyzer(descriptorPath, tokenType, configuration);
   checkRandomData(random(), uimaAnalyzer, 100 * RANDOM_MULTIPLIER);
   uimaAnalyzer.close();
 }
 /** An empty keyword token must come out of KStemFilter as an empty term. */
 public void testEmptyTerm() throws IOException {
   Analyzer analyzer =
       new Analyzer() {
         @Override
         protected TokenStreamComponents createComponents(String fieldName) {
           Tokenizer source = new KeywordTokenizer();
           TokenStream stemmed = new KStemFilter(source);
           return new TokenStreamComponents(source, stemmed);
         }
       };
   String empty = "";
   checkOneTerm(analyzer, empty, empty);
   analyzer.close();
 }
 /** Random-data check for a whitespace MockTokenizer followed by ReverseStringFilter. */
 public void testRandomStrings() throws Exception {
   Analyzer reversing =
       new Analyzer() {
         @Override
         protected TokenStreamComponents createComponents(String fieldName) {
           Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
           TokenStream reversed = new ReverseStringFilter(source);
           return new TokenStreamComponents(source, reversed);
         }
       };
   checkRandomData(random(), reversing, 1000 * RANDOM_MULTIPLIER);
   reversing.close();
 }
 // Adds random graph after:
 /**
  * Random-data check with strings up to length 8192 for a StandardTokenizer followed by a
  * MockGraphTokenFilter.
  */
 public void testRandomHugeStringsGraphAfter() throws Exception {
   Random random = random();
   Analyzer graphAnalyzer =
       new Analyzer() {
         @Override
         protected TokenStreamComponents createComponents(String fieldName) {
           Tokenizer source = new StandardTokenizer(newAttributeFactory());
           // the filter asks the test framework for its own random source
           TokenStream withGraph = new MockGraphTokenFilter(random(), source);
           return new TokenStreamComponents(source, withGraph);
         }
       };
   checkRandomData(random, graphAnalyzer, 100 * RANDOM_MULTIPLIER, 8192);
   graphAnalyzer.close();
 }
  /**
   * Indexes {@code numDocs} documents built from growing prefixes of a generated term
   * sequence, runs a MoreLikeThis query limited to {@code topN} terms over the full sequence,
   * and checks that exactly {@code topN} clauses come back and that each one is among the
   * last {@code topN} terms of the sequence.
   */
  public void testTopN() throws Exception {
    int numDocs = 100;
    int topN = 25;

    // add series of docs with terms of decreasing df
    Directory dir = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
    for (int i = 0; i < numDocs; i++) {
      addDoc(writer, generateStrSeq(0, i + 1));
    }
    IndexReader reader = writer.getReader();
    writer.close();

    // setup MLT query
    MoreLikeThis mlt = new MoreLikeThis(reader);
    Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
    mlt.setAnalyzer(analyzer);
    mlt.setMaxQueryTerms(topN);
    mlt.setMinDocFreq(1);
    mlt.setMinTermFreq(1);
    mlt.setMinWordLen(1);
    mlt.setFieldNames(new String[] {"text"});

    // perform MLT query; every term is followed by a single space, as before
    StringBuilder likeText = new StringBuilder();
    for (String text : generateStrSeq(0, numDocs)) {
      likeText.append(text).append(' ');
    }
    BooleanQuery query = (BooleanQuery) mlt.like("text", new StringReader(likeText.toString()));

    // check best terms are topN of highest idf
    Collection<BooleanClause> clauses = query.clauses();
    assertEquals("Expected " + topN + " clauses only!", topN, clauses.size());

    Term[] expectedTerms = new Term[topN];
    int idx = 0;
    for (String text : generateStrSeq(numDocs - topN, topN)) {
      expectedTerms[idx++] = new Term("text", text);
    }
    // build the lookup view once instead of once per clause
    Collection<Term> expected = Arrays.asList(expectedTerms);
    for (BooleanClause clause : clauses) {
      Term term = ((TermQuery) clause.getQuery()).getTerm();
      assertTrue(expected.contains(term));
    }

    // clean up
    reader.close();
    dir.close();
    analyzer.close();
  }
Exemple #22
0
  /**
   * Saves input documents in a format that can be read by the LDA model.
   *
   * <p>The output file {@code model/newdocs.dat} starts with a line holding the number of
   * documents, followed by one line per hit. Each document line contains the analyzed terms of
   * the hit's "contents" field, every term preceded by a single space, with terms listed in
   * {@code IndexerTREC.STOP_WORDS} left out.
   *
   * <p>The count is written as the first line up front. Writing it at offset 0 after the
   * documents, as was done previously, overwrote the beginning of the first document.
   *
   * @param hits search hits whose documents are exported; read from the "Index_TREC" index
   * @throws IOException if the index cannot be read or the output file cannot be written
   */
  public void saveDocumentsToFile(ScoreDoc[] hits) throws IOException {
    String index = "Index_TREC";
    String docFileName = "model/newdocs.dat";

    List<String> stopwords = new ArrayList<String>();
    for (String stopWord : IndexerTREC.STOP_WORDS) {
      stopwords.add(stopWord);
    }

    // every resource is closed in reverse order, also when an exception is thrown mid-export
    try (IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(index)));
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_48, IndexerTREC.stopWordsSet);
        BufferedWriter bufferedWriterDocs =
            new BufferedWriter(new FileWriter(new File(docFileName).getAbsoluteFile(), false))) {
      IndexSearcher searcher = new IndexSearcher(reader);

      // header line: number of documents that follow
      bufferedWriterDocs.write(Integer.toString(hits.length) + "\n");

      for (ScoreDoc hit : hits) {
        Document doc = searcher.doc(hit.doc);
        StringBuilder docContent = new StringBuilder();
        try (TokenStream tokenStream = analyzer.tokenStream("contents", doc.get("contents"))) {
          CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
          tokenStream.reset();
          while (tokenStream.incrementToken()) {
            String term = charTermAttribute.toString();
            if (!stopwords.contains(term.toLowerCase())) {
              docContent.append(' ').append(term);
            }
          }
          tokenStream.end();
        }
        bufferedWriterDocs.write(docContent.append('\n').toString());
      }
    }
  }
 // LUCENE-5269
 /**
  * Random-data check for EdgeNGramTokenizer(2, 94) followed by ShingleFilter(5) and
  * NGramTokenFilter(55, 83).
  */
 @Slow
 public void testUnicodeShinglesAndNgrams() throws Exception {
   Analyzer chained =
       new Analyzer() {
         @Override
         protected TokenStreamComponents createComponents(String fieldName) {
           // for debugging, a SopTokenFilter can be wrapped around any stage below
           Tokenizer source = new EdgeNGramTokenizer(2, 94);
           TokenStream shingles = new ShingleFilter(source, 5);
           TokenStream ngrams = new NGramTokenFilter(shingles, 55, 83);
           return new TokenStreamComponents(source, ngrams);
         }
       };
   checkRandomData(random(), chained, 2000);
   chained.close();
 }
  /**
   * Checks that setting a boost factor multiplies each term's boost: every clause boost must
   * equal the corresponding value from {@code getOriginalValues()} times the factor.
   */
  public void testBoostFactor() throws Throwable {
    Map<String, Float> baselineBoosts = getOriginalValues();

    Analyzer whitespaceAnalyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
    MoreLikeThis mlt = new MoreLikeThis(reader);
    mlt.setAnalyzer(whitespaceAnalyzer);
    mlt.setMinDocFreq(1);
    mlt.setMinTermFreq(1);
    mlt.setMinWordLen(1);
    mlt.setFieldNames(new String[] {"text"});
    mlt.setBoost(true);

    // every term boost is expected to be multiplied by this number
    float boostFactor = 5;
    mlt.setBoostFactor(boostFactor);

    BooleanQuery likeQuery = (BooleanQuery) mlt.like("text", new StringReader("lucene release"));
    Collection<BooleanClause> clauses = likeQuery.clauses();

    int expectedCount = baselineBoosts.size();
    assertEquals("Expected " + expectedCount + " clauses.", expectedCount, clauses.size());

    for (BooleanClause clause : clauses) {
      BoostQuery boosted = (BoostQuery) clause.getQuery();
      TermQuery inner = (TermQuery) boosted.getQuery();
      String termText = inner.getTerm().text();

      Float baseline = baselineBoosts.get(termText);
      assertNotNull("Expected term " + termText, baseline);

      float totalBoost = baseline * boostFactor;
      String message =
          "Expected boost of "
              + totalBoost
              + " for term '"
              + termText
              + "' got "
              + boosted.getBoost();
      assertEquals(message, totalBoost, boosted.getBoost(), 0.0001);
    }
    whitespaceAnalyzer.close();
  }
  /**
   * Runs the "lucene release" MoreLikeThis query with boosting enabled and no explicit boost
   * factor, and returns the boost of each resulting clause.
   *
   * @return map from term text to the boost of its clause
   * @throws IOException if the query cannot be built
   */
  private Map<String, Float> getOriginalValues() throws IOException {
    Analyzer whitespaceAnalyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
    MoreLikeThis mlt = new MoreLikeThis(reader);
    mlt.setAnalyzer(whitespaceAnalyzer);
    mlt.setMinDocFreq(1);
    mlt.setMinTermFreq(1);
    mlt.setMinWordLen(1);
    mlt.setFieldNames(new String[] {"text"});
    mlt.setBoost(true);

    BooleanQuery likeQuery = (BooleanQuery) mlt.like("text", new StringReader("lucene release"));

    Map<String, Float> boostsByTerm = new HashMap<>();
    for (BooleanClause clause : likeQuery.clauses()) {
      BoostQuery boosted = (BoostQuery) clause.getQuery();
      TermQuery inner = (TermQuery) boosted.getQuery();
      boostsByTerm.put(inner.getTerm().text(), boosted.getBoost());
    }
    whitespaceAnalyzer.close();
    return boostsByTerm;
  }
 /**
  * Loads the "simple" dictionary with the ignore-case flag set and checks that a mixed-case
  * input passes through HunspellStemFilter with its casing untouched.
  */
 public void testIgnoreCaseNoSideEffects() throws Exception {
   final Dictionary dictionary;
   // no multiple try-with to workaround bogus VerifyError
   InputStream affixes = TestStemmer.class.getResourceAsStream("simple.aff");
   InputStream words = TestStemmer.class.getResourceAsStream("simple.dic");
   try {
     dictionary = new Dictionary(affixes, Collections.singletonList(words), true);
   } finally {
     IOUtils.closeWhileHandlingException(affixes, words);
   }

   Analyzer analyzer =
       new Analyzer() {
         @Override
         protected TokenStreamComponents createComponents(String fieldName) {
           Tokenizer source = new KeywordTokenizer();
           TokenStream stemmed = new HunspellStemFilter(source, dictionary);
           return new TokenStreamComponents(source, stemmed);
         }
       };
   String mixedCase = "NoChAnGy";
   checkOneTerm(analyzer, mixedCase, mixedCase);
   analyzer.close();
 }
 /** Runs three Greek sentences through one GreekAnalyzer instance and checks the tokens. */
 public void testReusableTokenStream() throws Exception {
   Analyzer greek = new GreekAnalyzer();

   // capitals and small accented letters, plus stemming
   String[] firstExpected = {
     "μια", "εξαιρετ", "καλ", "πλουσ", "σειρ", "χαρακτηρ", "ελληνικ", "γλωσσ"
   };
   assertAnalyzesTo(
       greek,
       "Μία εξαιρετικά καλή και πλούσια σειρά χαρακτήρων της Ελληνικής γλώσσας",
       firstExpected);

   // small letters with diaeresis; punctuation marks are eliminated
   String[] secondExpected = {"προιοντ", "πολλαπλ", "αναγκ"};
   assertAnalyzesTo(greek, "Προϊόντα (και)     [πολλαπλές] - ΑΝΑΓΚΕΣ", secondExpected);

   // capital accented letters and capitals with diaeresis; stop words are eliminated
   String[] thirdExpected = {"προυποθεσ", "αψογ", "μεστ", "αλλ"};
   assertAnalyzesTo(greek, "ΠΡΟΫΠΟΘΕΣΕΙΣ  Άψογος, ο μεστός και οι άλλοι", thirdExpected);

   greek.close();
 }
 /**
  * Closes the test's analyzer and then runs the superclass teardown.
  *
  * <p>The superclass teardown runs even if closing the analyzer throws.
  */
 @Override
 public void tearDown() throws Exception {
   try {
     analyzer.close();
   } finally {
     super.tearDown();
   }
 }
 /** Random-data check for the default GreekAnalyzer. */
 public void testRandomStrings() throws Exception {
   Analyzer greek = new GreekAnalyzer();
   int iterations = 1000 * RANDOM_MULTIPLIER;
   checkRandomData(random(), greek, iterations);
   greek.close();
 }
 /**
  * Closes the wrapped {@code fakeAnalyzer} and then this object's superclass resources.
  *
  * <p>The superclass close runs even if closing {@code fakeAnalyzer} throws.
  */
 @Override
 public void close() {
   try {
     fakeAnalyzer.close();
   } finally {
     super.close();
   }
 }