예제 #1
0
  public void collect() throws Exception {
    String[] queryFileNames = MIRPath.QueryFileNames;

    String[] indexDirNames = MIRPath.IndexDirNames;

    String[] relDataFileNames = MIRPath.RelevanceFileNames;

    String[] docMapFileNames = MIRPath.DocIdMapFileNames;

    String[] queryDocFileNames = MIRPath.QueryDocFileNames;

    IndexSearcher[] indexSearchers = SearcherUtils.getIndexSearchers(indexDirNames);

    Analyzer analyzer = MedicalEnglishAnalyzer.getAnalyzer();

    for (int i = 0; i < queryFileNames.length; i++) {
      List<BaseQuery> bqs = QueryReader.readQueries(queryFileNames[i]);
      CounterMap<String, String> queryRels = RelevanceReader.readRelevances(relDataFileNames[i]);

      List<Counter<String>> qs = new ArrayList<Counter<String>>();

      for (int j = 0; j < bqs.size(); j++) {
        BaseQuery bq = bqs.get(j);
        qs.add(AnalyzerUtils.getWordCounts(bq.getSearchText(), analyzer));
      }

      BidMap<String, String> docIdMap = DocumentIdMapper.readDocumentIdMap(docMapFileNames[i]);

      // queryRelevances = RelevanceReader.filter(queryRelevances, docIdMap);

      // baseQueries = QueryReader.filter(baseQueries, queryRelevances);

      List<SparseVector> docRelData =
          DocumentIdMapper.mapDocIdsToIndexIds(bqs, queryRels, docIdMap);

      IndexReader indexReader = indexSearchers[i].getIndexReader();

      if (bqs.size() != docRelData.size()) {
        throw new Exception();
      }

      TextFileWriter writer = new TextFileWriter(queryDocFileNames[i]);

      for (int j = 0; j < bqs.size(); j++) {
        BaseQuery bq = bqs.get(j);
        SparseVector docRels = docRelData.get(j);

        Indexer<String> wordIndexer = new Indexer<String>();

        Counter<String> qwcs = qs.get(j);

        SparseVector q = VectorUtils.toSparseVector(qs.get(j), wordIndexer, true);

        {
          SparseVector docFreqs =
              VectorUtils.toSparseVector(
                  WordCountBox.getDocFreqs(indexReader, IndexFieldName.CONTENT, qs.get(j).keySet()),
                  wordIndexer,
                  true);
          computeTFIDFs(q, docFreqs, indexReader.maxDoc());
        }

        WordCountBox wcb = WordCountBox.getWordCountBox(indexReader, docRels, wordIndexer);
        SparseMatrix sm = wcb.getDocWordCounts();
        SparseVector docFreqs = wcb.getCollDocFreqs();

        for (int k = 0; k < sm.rowSize(); k++) {
          int docId = sm.indexAtRowLoc(k);
          SparseVector sv = sm.vectorAtRowLoc(k);
          computeTFIDFs(sv, docFreqs, wcb.getNumDocsInCollection());
        }

        writer.write(
            String.format(
                "#Query\t%d\t%s\n", j + 1, toString(VectorUtils.toCounter(q, wordIndexer))));

        docRels.sortByValue();

        for (int k = 0; k < docRels.size(); k++) {
          int docId = docRels.indexAtLoc(k);
          double rel = docRels.valueAtLoc(k);
          SparseVector sv = sm.rowAlways(docId);

          if (sv.size() == 0) {
            continue;
          }

          writer.write(
              String.format(
                  "%d\t%d\t%s\n",
                  docId, (int) rel, toString(VectorUtils.toCounter(sv, wordIndexer))));
        }
        writer.write("\n");
      }
    }
  }
예제 #2
0
  public void collect2() throws Exception {
    String[] queryFileNames = MIRPath.QueryFileNames;

    String[] indexDirNames = MIRPath.IndexDirNames;

    String[] relDataFileNames = MIRPath.RelevanceFileNames;

    String[] docMapFileNames = MIRPath.DocIdMapFileNames;

    String[] queryDocFileNames = MIRPath.QueryDocFileNames;

    IndexSearcher[] indexSearchers = SearcherUtils.getIndexSearchers(indexDirNames);

    Analyzer analyzer = MedicalEnglishAnalyzer.getAnalyzer();

    for (int i = 0; i < queryFileNames.length; i++) {
      List<BaseQuery> bqs = new ArrayList<BaseQuery>();
      CounterMap<String, String> queryRels = new CounterMap<String, String>();

      File queryFile = new File(queryFileNames[i]);
      File relvFile = new File(relDataFileNames[i]);

      if (i == 0) {
        bqs = QueryReader.readTrecCdsQueries(queryFileNames[i]);
        queryRels = RelevanceReader.readTrecCdsRelevances(relDataFileNames[i]);
      } else if (i == 1) {
        bqs = QueryReader.readClefEHealthQueries(queryFileNames[i]);
        queryRels = RelevanceReader.readClefEHealthRelevances(relDataFileNames[i]);
      } else if (i == 2) {
        bqs = QueryReader.readOhsumedQueries(queryFileNames[i]);
        queryRels = RelevanceReader.readOhsumedRelevances(relDataFileNames[i]);
      } else if (i == 3) {
        bqs = QueryReader.readTrecGenomicsQueries(queryFileNames[i]);
        queryRels = RelevanceReader.readTrecGenomicsRelevances(relDataFileNames[i]);
      }

      List<Counter<String>> qs = new ArrayList<Counter<String>>();

      for (int j = 0; j < bqs.size(); j++) {
        BaseQuery bq = bqs.get(j);
        qs.add(AnalyzerUtils.getWordCounts(bq.getSearchText(), analyzer));
      }

      BidMap<String, String> docIdMap = DocumentIdMapper.readDocumentIdMap(docMapFileNames[i]);

      // queryRelevances = RelevanceReader.filter(queryRelevances, docIdMap);

      // baseQueries = QueryReader.filter(baseQueries, queryRelevances);

      List<SparseVector> docRelData =
          DocumentIdMapper.mapDocIdsToIndexIds(bqs, queryRels, docIdMap);

      IndexReader indexReader = indexSearchers[i].getIndexReader();

      if (bqs.size() != docRelData.size()) {
        throw new Exception();
      }

      TextFileWriter writer = new TextFileWriter(queryDocFileNames[i]);

      for (int j = 0; j < bqs.size(); j++) {
        BaseQuery bq = bqs.get(j);
        SparseVector docRels = docRelData.get(j);

        Indexer<String> wordIndexer = new Indexer<String>();

        Counter<String> qwcs = qs.get(j);

        SparseVector q = VectorUtils.toSparseVector(qs.get(j), wordIndexer, true);

        {
          SparseVector docFreqs =
              VectorUtils.toSparseVector(
                  WordCountBox.getDocFreqs(indexReader, IndexFieldName.CONTENT, qs.get(j).keySet()),
                  wordIndexer,
                  true);
          computeTFIDFs(q, docFreqs, indexReader.maxDoc());
        }

        WordCountBox wcb = WordCountBox.getWordCountBox(indexReader, docRels, wordIndexer);
        SparseMatrix sm = wcb.getDocWordCounts();
        SparseVector docFreqs = wcb.getCollDocFreqs();

        for (int k = 0; k < sm.rowSize(); k++) {
          int docId = sm.indexAtRowLoc(k);
          SparseVector sv = sm.vectorAtRowLoc(k);
          computeTFIDFs(sv, docFreqs, wcb.getNumDocsInCollection());
        }

        writer.write(String.format("#Query\t%d\t%s\n", j + 1, bq.toString()));
        writer.write(
            String.format("#Query Words\t%s\n", toString(VectorUtils.toCounter(q, wordIndexer))));

        docRels.sortByValue();

        for (int k = 0; k < docRels.size(); k++) {
          int docId = docRels.indexAtLoc(k);
          double rel = docRels.valueAtLoc(k);

          if (rel == 0) {
            continue;
          }

          Document doc = indexReader.document(docId);

          List<Integer> ws = wcb.getDocWords().get(docId);

          StringBuffer sb = new StringBuffer();

          for (int l = 0; l < ws.size(); l++) {
            int w = ws.get(l);
            boolean found = false;
            if (q.location(w) > -1) {
              found = true;
            }

            if (found) {
              sb.append(
                  String.format(
                      "%d\t%s\t%s\n", l + 1, wordIndexer.getObject(w), found ? 1 + "" : ""));
            }
          }

          String content = doc.get(IndexFieldName.CONTENT);

          SparseVector sv = sm.rowAlways(docId);

          if (sv.size() == 0) {
            continue;
          }

          writer.write(String.format("DOC-ID\t%d\nRelevance\t%d\n", docId, (int) rel));
          writer.write(String.format("Loc\tWord\tMark\n%s\n", sb.toString()));
        }
        writer.write("\n");
      }
    }
  }