Exemplo n.º 1
0
  /**
   * Builds query+document feature vectors from the TREC CDS query-document file and writes them,
   * together with the word indexer, to {@code DATA_FILE} and {@code WORD_INDEXER_FILE}.
   *
   * <p>The input is consumed in groups of lines (one group per {@code getNextLines()} call). The
   * first line of a group is the query and every following line is a document. Lines are
   * tab-separated: field 1 holds the query id (query line) or the relevance (document lines), and
   * field 2 holds space-separated {@code term:weight} tokens. Each document contributes one output
   * vector, {@code VectorMath.add(query, document)}, labelled with the document's relevance
   * truncated to an {@code int}.
   *
   * <p>Empty groups are skipped.
   *
   * @throws Exception if reading the input, parsing a line, or writing the output fails
   */
  public static void generateData1() throws Exception {
    System.out.println("generate data.");

    Indexer<String> wordIndexer = new Indexer<String>();
    List<SparseVector> data = new ArrayList<SparseVector>();

    TextFileReader reader = new TextFileReader(MIRPath.TREC_CDS_QUERY_DOC_FILE);
    try {
      while (reader.hasNext()) {
        List<String> lines = reader.getNextLines();
        if (lines.isEmpty()) {
          // Nothing to pair up; without this guard svs.get(0) below would throw.
          continue;
        }

        List<SparseVector> svs = new ArrayList<SparseVector>(lines.size());

        for (int i = 0; i < lines.size(); i++) {
          String[] parts = lines.get(i).split("\t");

          // Field 2: space-separated "term:weight" tokens.
          StrCounter c = new StrCounter();
          for (String tok : parts[2].split(" ")) {
            String[] two = StrUtils.split2Two(":", tok);
            c.incrementCount(two[0], Double.parseDouble(two[1]));
          }

          // NOTE(review): the 'true' flag presumably lets unseen terms be added to wordIndexer
          // -- confirm against VectorUtils.toSparseVector.
          SparseVector sv = VectorUtils.toSparseVector(c, wordIndexer, true);

          // Only document lines (i > 0) carry a relevance in field 1; on the query line that
          // field is the query id, which this method does not need.
          if (i > 0) {
            sv.setLabel((int) Double.parseDouble(parts[1]));
          }

          svs.add(sv);
        }

        SparseVector q = svs.get(0);

        for (int i = 1; i < svs.size(); i++) {
          SparseVector d = svs.get(i);
          SparseVector qd = VectorMath.add(q, d);
          // Carry the document's relevance label over to the combined vector.
          qd.setLabel(d.label());
          data.add(qd);
        }
      }
    } finally {
      // Release the input file even when a line fails to parse.
      reader.close();
    }

    SparseVector.write(DATA_FILE, data);
    IOUtils.write(WORD_INDEXER_FILE, wordIndexer);
  }
Exemplo n.º 2
0
  /**
   * Builds document feature vectors labelled by query type from the TREC CDS query-document file
   * and writes them, together with the word indexer, to {@code DATA_FILE} and
   * {@code WORD_INDEXER_FILE}.
   *
   * <p>Queries are loaded from {@code MIRPath.TREC_CDS_QUERY_2014_FILE} and keyed by their integer
   * id. The query-document file is consumed in groups of lines: the first line of a group is the
   * query (field 1 = query id) and each following line is a document (field 1 = relevance). Field
   * 2 of every line holds space-separated {@code term:weight} tokens. A document whose relevance,
   * truncated to an {@code int}, is positive is labelled with the index of its query's type;
   * every other document is labelled {@code 3}.
   *
   * <p>Empty groups are skipped.
   *
   * @throws IllegalStateException if a group's query id is not present in the query file
   * @throws Exception if reading the input, parsing a line, or writing the output fails
   */
  public static void generateData2() throws Exception {
    System.out.println("generate data.");

    List<BaseQuery> bqs = QueryReader.readTrecCdsQueries(MIRPath.TREC_CDS_QUERY_2014_FILE);

    Map<Integer, BaseQuery> queryMap = new HashMap<Integer, BaseQuery>();
    for (BaseQuery bq : bqs) {
      queryMap.put(Integer.parseInt(bq.getId()), bq);
    }

    Indexer<String> wordIndexer = new Indexer<String>();
    // NOTE(review): typeIndexer determines the positive labels but is never written out here;
    // confirm whether consumers of DATA_FILE need the type-to-id mapping.
    Indexer<String> typeIndexer = new Indexer<String>();

    // Label for documents whose relevance is not positive.
    // NOTE(review): presumably chosen to follow type ids 0..2 -- confirm the number of query types.
    final int nonRelevantLabel = 3;

    List<SparseVector> data = new ArrayList<SparseVector>();

    TextFileReader reader = new TextFileReader(MIRPath.TREC_CDS_QUERY_DOC_FILE);
    try {
      while (reader.hasNext()) {
        List<String> lines = reader.getNextLines();
        if (lines.isEmpty()) {
          // No query line, hence no query id to look up.
          continue;
        }

        List<SparseVector> svs = new ArrayList<SparseVector>(lines.size());
        int qid = -1;

        for (int i = 0; i < lines.size(); i++) {
          String[] parts = lines.get(i).split("\t");

          // Field 2: space-separated "term:weight" tokens.
          StrCounter c = new StrCounter();
          for (String tok : parts[2].split(" ")) {
            String[] two = StrUtils.split2Two(":", tok);
            c.incrementCount(two[0], Double.parseDouble(two[1]));
          }

          // The query line is vectorized as well, so its terms pass through wordIndexer in the
          // same order as before, even though the query vector itself is not emitted.
          SparseVector sv = VectorUtils.toSparseVector(c, wordIndexer, true);

          if (i == 0) {
            qid = Integer.parseInt(parts[1]);
          } else {
            sv.setLabel((int) Double.parseDouble(parts[1]));
          }

          svs.add(sv);
        }

        BaseQuery bq = queryMap.get(qid);
        if (bq == null) {
          throw new IllegalStateException("No query with id " + qid + " found in "
              + MIRPath.TREC_CDS_QUERY_2014_FILE);
        }

        TrecCdsQuery tcq = (TrecCdsQuery) bq;
        int typeId = typeIndexer.getIndex(tcq.getType());

        // Index 0 is the query; only the documents are emitted.
        for (int i = 1; i < svs.size(); i++) {
          SparseVector d = svs.get(i);

          // Replace the relevance label with the class label.
          if (d.label() > 0) {
            d.setLabel(typeId);
          } else {
            d.setLabel(nonRelevantLabel);
          }

          data.add(d);
        }
      }
    } finally {
      // Release the input file even when a line fails to parse or a query is missing.
      reader.close();
    }

    SparseVector.write(DATA_FILE, data);
    IOUtils.write(WORD_INDEXER_FILE, wordIndexer);
  }