Пример #1
0
  /**
   * Checks that searching two indexes combined through one {@code LgteIsolatedIndexReader} gives
   * the same first three hits (document id and score) as searching the single index at {@code
   * pathUnique}.
   *
   * <p>Both searchers use {@code Model.OkapiBM25Model} and share one {@code QueryConfiguration}.
   * The "expected" and "returned" hits are printed to standard output before being compared.
   *
   * @throws IOException if one of the index operations in the body fails
   * @throws InvalidGeoException declared by the original signature; which call raises it is not
   *     visible from this method
   */
  public void testRange() throws IOException, InvalidGeoException {
    // Number of leading hits that are printed and compared.
    final int hitsToCompare = 3;

    double k1 = 2.0d;
    double b = 0.75d;
    double epslon = 0.05d;
    QueryConfiguration queryConfiguration = new QueryConfiguration();
    queryConfiguration.setProperty("bm25.idf.policy", "floor_epslon");
    queryConfiguration.setProperty("bm25.idf.epslon", "" + epslon);
    queryConfiguration.setProperty("bm25.k1", "" + k1);
    queryConfiguration.setProperty("bm25.b", "" + b);

    // Reference searcher: a single index holding every field.
    LgteIndexSearcherWrapper searcher =
        new LgteIndexSearcherWrapper(Model.OkapiBM25Model, pathUnique);
    try {
      // Searcher under test: one reader per field, combined by LgteIsolatedIndexReader.
      IndexReader readerMulti1 = LgteIndexManager.openReader(pathMulti1, Model.OkapiBM25Model);
      IndexReader readerMulti2 = LgteIndexManager.openReader(pathMulti2, Model.OkapiBM25Model);
      Map<String, IndexReader> readers = new HashMap<String, IndexReader>();
      readers.put("contents1", readerMulti1);
      readers.put("contents2", readerMulti2);

      LgteIndexSearcherWrapper searcherMulti =
          new LgteIndexSearcherWrapper(Model.OkapiBM25Model, new LgteIsolatedIndexReader(readers));
      try {
        LgteQuery lgteQuery =
            LgteQueryParser.parseQuery(
                "contents1:(word2 word67 word1*) contents2:(word1* word2 word67)",
                searcher,
                queryConfiguration);
        LgteHits lgteHits = searcher.search(lgteQuery);
        LgteQuery lgteQueryMulti =
            LgteQueryParser.parseQuery(
                "contents1:(word1* word2 word67) contents2:(word1* word2 word67)",
                searcherMulti,
                queryConfiguration);
        LgteHits lgteHitsMulti = searcherMulti.search(lgteQueryMulti);

        System.out.println("EXPECTED");
        for (int i = 0; i < hitsToCompare; i++) {
          System.out.println(
              "doc:" + lgteHits.doc(i).get(Globals.DOCUMENT_ID_FIELD) + ":" + lgteHits.score(i));
        }
        System.out.println("RETURN:");
        for (int i = 0; i < hitsToCompare; i++) {
          System.out.println(
              "doc:"
                  + lgteHitsMulti.doc(i).get(Globals.DOCUMENT_ID_FIELD)
                  + ":"
                  + lgteHitsMulti.score(i));
        }

        // Same ranking: the i-th hit of both searches must be the same document.
        for (int i = 0; i < hitsToCompare; i++) {
          assertEquals(
              lgteHits.doc(i).get(Globals.DOCUMENT_ID_FIELD),
              lgteHitsMulti.doc(i).get(Globals.DOCUMENT_ID_FIELD));
        }

        // Same scoring: the i-th hit must carry an identical score in both searches.
        for (int i = 0; i < hitsToCompare; i++) {
          assertEquals(lgteHits.score(i), lgteHitsMulti.score(i));
        }
      } catch (ParseException e) {
        fail(e.toString());
      } finally {
        // Released even when an assertion above fails.
        // NOTE(review): assumes closing the wrapper also releases readerMulti1/readerMulti2 — confirm.
        searcherMulti.close();
      }
    } finally {
      searcher.close();
    }
  }
  /**
   * Builds a two-document "Contents" index and a three-sentence "Sentences" index, joins them
   * through an {@code LgteIsolatedIndexReader}, and checks that filtering the query {@code
   * sentences:word2} on the timex flags leaves exactly one hit.
   *
   * <p>Three filters are exercised: a {@code QueryFilter} and a {@code TermsFilter} on {@code
   * Config.S_HAS_TIMEXES} (set only on document "1"), and a {@code TermsFilter} on {@code
   * Config.S_HAS_TIMEXES + "_sentences"} (set only on sentence "2_1"). The searcher is closed and
   * both index directories are deleted even when an assertion fails.
   *
   * @throws IOException if writing, reading or deleting an index fails
   * @throws DocumentException declared by the original signature; the raising call is not visible
   *     from this method
   * @throws ParseException if one of the query strings cannot be parsed
   */
  public void testTimeFilter() throws IOException, DocumentException, ParseException {
    // Document-level index: only document "1" carries the timex flag.
    LgteIndexWriter writer = new LgteIndexWriter(path + "Contents", true);
    LgteDocumentWrapper doc1 = new LgteDocumentWrapper();
    doc1.indexText(Globals.DOCUMENT_ID_FIELD, "1");
    doc1.indexText("contents", "word1 word2 word3");
    doc1.indexStringNoStore(Config.S_HAS_TIMEXES, "true");
    LgteDocumentWrapper doc2 = new LgteDocumentWrapper();
    doc2.indexText(Globals.DOCUMENT_ID_FIELD, "2");
    doc2.indexText("contents", "word2 word3 word4 word55 word96 word2 word54 word33 wordss");
    writer.addDocument(doc1);
    writer.addDocument(doc2);
    writer.close();

    // Sentence-level index: each sentence names its parent document in "doc_id"; only the
    // sentence of document "2" carries the sentence-level timex flag.
    LgteIndexWriter writer2 = new LgteIndexWriter(path + "Sentences", true);
    LgteDocumentWrapper sentence0 = new LgteDocumentWrapper();
    sentence0.indexText(Globals.DOCUMENT_ID_FIELD, "1_0");
    sentence0.indexText("doc_id", "1");
    sentence0.indexText("sentences", "word1 word3");
    LgteDocumentWrapper sentence1 = new LgteDocumentWrapper();
    sentence1.indexText(Globals.DOCUMENT_ID_FIELD, "1_1");
    sentence1.indexText("doc_id", "1");
    sentence1.indexText("sentences", "word1 word2 word3");
    LgteDocumentWrapper sentence2 = new LgteDocumentWrapper();
    sentence2.indexStringNoStore(Config.S_HAS_TIMEXES + "_sentences", "true");
    sentence2.indexText(Globals.DOCUMENT_ID_FIELD, "2_1");
    sentence2.indexText("doc_id", "2");
    sentence2.indexText("sentences", "word2 word3 word4 word55 word96 word2 word54 word33 wordss");
    writer2.addDocument(sentence0);
    writer2.addDocument(sentence1);
    writer2.addDocument(sentence2);
    writer2.close();

    // Route each field name to the reader of the index that holds it.
    IndexReader readerContents =
        LgteIndexManager.openReader(path + "Contents", Model.OkapiBM25Model);
    IndexReader readerSentences =
        LgteIndexManager.openReader(path + "Sentences", Model.OkapiBM25Model);
    Map<String, IndexReader> readers = new HashMap<String, IndexReader>();
    readers.put("contents", readerContents);
    readers.put("sentences", readerSentences);
    readers.put(Config.S_HAS_TIMEXES, readerContents);
    readers.put(Config.S_HAS_TIMEXES + "_sentences", readerSentences);
    readers.put("doc_id", readerSentences);
    readers.put("id", readerSentences);
    LgteIsolatedIndexReader lgteIsolatedIndexReader = new LgteIsolatedIndexReader(readers);
    // NOTE(review): presumably links each sentence to its parent document via "doc_id" — confirm.
    lgteIsolatedIndexReader.addTreeMapping(readerContents, readerSentences, "doc_id");

    LgteIndexSearcherWrapper searcher =
        new LgteIndexSearcherWrapper(Model.OkapiBM25Model, lgteIsolatedIndexReader);
    try {
      QueryConfiguration queryConfiguration = new QueryConfiguration();
      queryConfiguration.setProperty("bm25.idf.policy", "floor_epslon");
      queryConfiguration.setProperty("bm25.idf.epslon", "0.01");
      queryConfiguration.setProperty("bm25.k1", "2.0");
      queryConfiguration.setProperty("bm25.b", "0.75");
      queryConfiguration.setProperty("index.tree", "true");
      QueryFilter queryFilter =
          new QueryFilter(
              org.apache.lucene.queryParser.QueryParser.parse(
                  "true", Config.S_HAS_TIMEXES, new LgteNothingAnalyzer()));
      LgteQuery lgteQuery =
          LgteQueryParser.parseQuery(
              "sentences:word2", new LgteNothingAnalyzer(), searcher, queryConfiguration);

      // Document-level flag expressed as a QueryFilter.
      LgteHits lgteHits = searcher.search(lgteQuery, queryFilter);
      assertEquals(1, lgteHits.length());
      assertEquals(1, lgteHits.id(0));

      // Same document-level flag expressed as a TermsFilter: same expectation.
      TermsFilter termsFilter = new TermsFilter();
      termsFilter.addTerm(new Term(Config.S_HAS_TIMEXES, "true"));
      lgteHits = searcher.search(lgteQuery, termsFilter);
      assertEquals(1, lgteHits.length());
      assertEquals(1, lgteHits.id(0));

      // Sentence-level flag: the single hit is expected to have id 2.
      termsFilter = new TermsFilter();
      termsFilter.addTerm(new Term(Config.S_HAS_TIMEXES + "_sentences", "true"));
      lgteHits = searcher.search(lgteQuery, termsFilter);
      assertEquals(1, lgteHits.length());
      assertEquals(2, lgteHits.id(0));
    } finally {
      // Always release the searcher and remove the temporary indexes, so a failed assertion
      // does not leave index directories behind for later runs.
      try {
        searcher.close();
      } finally {
        Files.delDirsE(path + "Contents");
        Files.delDirsE(path + "Sentences");
      }
    }
  }
Пример #3
0
  /**
   * Searches for {@code metafile:doc} within a 14-year radius of the date {@code 1990-6-8}, sorted
   * by a {@code TimeDistanceSortSource}, and checks the result.
   *
   * <p>The test expects exactly five hits, each no further than the radius, in non-decreasing
   * order of time distance (both in years and in milliseconds), with the {@code name} field of
   * the hits being "1" to "5" in that order. The model used depends on the {@code lm} flag:
   * {@code Model.LanguageModel} when set, {@code Model.VectorSpaceModel} otherwise.
   *
   * @throws IOException if an index operation in the body fails
   * @throws InvalidGeoException declared by the original signature; the raising call is not
   *     visible from this method
   */
  public void testRange() throws IOException, InvalidGeoException {
    LgteIndexSearcherWrapper searcher;
    if (lm) {
      searcher = new LgteIndexSearcherWrapper(Model.LanguageModel, path);
    } else {
      searcher = new LgteIndexSearcherWrapper(Model.VectorSpaceModel, path);
    }

    try {
      int years = 14;

      // Time constraint of the query: reference date plus a radius in years.
      QueryParams queryParams = new QueryParams();
      queryParams.setTime("1990-6-8");
      queryParams.setRadiumYears(years);
      // Text part of the query: the term "doc" in the "metafile" field.
      Query tq = new TermQuery(new Term("metafile", "doc"));

      LgteQuery lgteQuery = new LgteQuery(tq, queryParams);
      TimeDistanceSortSource dsort = new TimeDistanceSortSource();
      LgteSort sort = new LgteSort(new SortField("foo", dsort));

      LgteHits hits = searcher.search(lgteQuery, sort);
      int results = hits.length();

      // The sort source also exposes the time distances it collected. LgteHits already provides
      // per-hit distances; the wrapper is read here only to check how many were collected.
      ITimeDistancesWrapper timeDistancesWrapper = dsort.getTimeDistancesWrapper();

      System.out.println(
          "Distance Filter filtered: " + timeDistancesWrapper.getTimeDistances().size());
      System.out.println("Results: " + results);
      System.out.println("=============================");
      assertEquals(5, timeDistancesWrapper.getTimeDistances().size());
      assertEquals(5, results);

      int lastYears = 0;
      long lastMili = 0;
      for (int i = 0; i < results; i++) {
        LgteDocumentWrapper d = hits.doc(i);
        String name = d.get("name");

        int distanceYears = hits.timeDistanceYears(i);
        long distanceMili = hits.timeDistanceMiliseconds(i);
        // Inside the radius, and never closer than the previous hit.
        assertTrue(distanceYears <= years);
        assertTrue(distanceYears >= lastYears);
        assertTrue(distanceMili >= lastMili);
        lastYears = distanceYears;
        lastMili = distanceMili;

        System.out.println(
            "Name: " + name + ", Distance (years, mili):" + distanceYears + " |" + distanceMili);

        // Hits are expected in the order "1", "2", ... so hit i must be named i + 1.
        // assertEquals reports both values on mismatch and fails cleanly on a null name.
        assertEquals(String.valueOf(i + 1), name);
      }
    } finally {
      // Released even when an assertion above fails.
      searcher.close();
    }
  }