/**
 * Builds labelled query+document vectors from {@code MIRPath.TREC_CDS_QUERY_DOC_FILE} and
 * stores them in {@code DATA_FILE}, with the word indexer stored in
 * {@code WORD_INDEXER_FILE}.
 *
 * <p>The input is consumed one group of lines at a time ({@code reader.getNextLines()}).
 * Every line is tab-separated and its third field holds space-separated
 * {@code term:weight} tokens. The first line of a group is treated as the query; each
 * following line is a document whose second field is parsed as its relevance and
 * truncated to an {@code int} label. For every document the query and document vectors
 * are combined with {@code VectorMath.add} and the result carries the document's label.
 *
 * @throws Exception if the input cannot be read or parsed, or the output cannot be written
 */
public static void generateData1() throws Exception {
    System.out.println("generate data.");

    Indexer<String> wordIndexer = new Indexer<String>();
    List<SparseVector> data = new ArrayList<SparseVector>();

    TextFileReader reader = new TextFileReader(MIRPath.TREC_CDS_QUERY_DOC_FILE);
    try {
        while (reader.hasNext()) {
            List<String> lines = reader.getNextLines();
            List<SparseVector> svs = new ArrayList<SparseVector>();

            for (int i = 0; i < lines.size(); i++) {
                String[] parts = lines.get(i).split("\t");

                // Only document lines (i > 0) carry a relevance value in the second field.
                boolean isDocument = i > 0;
                double relevance = isDocument ? Double.parseDouble(parts[1]) : -1;

                StrCounter c = new StrCounter();
                String[] toks = parts[2].split(" ");
                for (int j = 0; j < toks.length; j++) {
                    String[] two = StrUtils.split2Two(":", toks[j]);
                    c.incrementCount(two[0], Double.parseDouble(two[1]));
                }

                SparseVector sv = VectorUtils.toSparseVector(c, wordIndexer, true);
                if (isDocument) {
                    sv.setLabel((int) relevance);
                }
                svs.add(sv);
            }

            SparseVector q = svs.get(0);
            for (int i = 1; i < svs.size(); i++) {
                SparseVector d = svs.get(i);
                SparseVector qd = VectorMath.add(q, d);
                qd.setLabel(d.label());
                data.add(qd);
            }
        }
    } finally {
        // Release the input even when a malformed line aborts parsing.
        reader.close();
    }

    // DATA_FILE is only opened here, after every group has been parsed, so a failed
    // run does not touch a previously written data file.
    SparseVector.write(DATA_FILE, data);
    IOUtils.write(WORD_INDEXER_FILE, wordIndexer);
}
public static void generateData2() throws Exception { System.out.println("generate data."); List<BaseQuery> bqs = QueryReader.readTrecCdsQueries(MIRPath.TREC_CDS_QUERY_2014_FILE); Map<Integer, BaseQuery> queryMap = new HashMap<Integer, BaseQuery>(); for (BaseQuery bq : bqs) { int qid = Integer.parseInt(bq.getId()); queryMap.put(qid, bq); } Indexer<String> wordIndexer = new Indexer<String>(); Indexer<String> typeIndexer = new Indexer<String>(); TextFileWriter writer = new TextFileWriter(DATA_FILE); List<SparseVector> data = new ArrayList<SparseVector>(); TextFileReader reader = new TextFileReader(MIRPath.TREC_CDS_QUERY_DOC_FILE); while (reader.hasNext()) { List<String> lines = reader.getNextLines(); List<SparseVector> svs = new ArrayList<SparseVector>(); int qid = -1; for (int i = 0; i < lines.size(); i++) { String line = lines.get(i); String[] parts = line.split("\t"); double relevance = -1; if (i == 0) { qid = Integer.parseInt(parts[1]); } else { relevance = Double.parseDouble(parts[1]); } StrCounter c = new StrCounter(); String[] toks = parts[2].split(" "); for (int j = 0; j < toks.length; j++) { String[] two = StrUtils.split2Two(":", toks[j]); c.incrementCount(two[0], Double.parseDouble(two[1])); } SparseVector sv = VectorUtils.toSparseVector(c, wordIndexer, true); if (i > 0) { sv.setLabel((int) relevance); } svs.add(sv); } TrecCdsQuery tcq = (TrecCdsQuery) queryMap.get(qid); String type = tcq.getType(); int typeId = typeIndexer.getIndex(type); // SparseVector q = svs.get(0); for (int i = 1; i < svs.size(); i++) { SparseVector d = svs.get(i); double relevance = d.label(); if (relevance > 0) { d.setLabel(typeId); } else { d.setLabel(3); } // SparseVector qd = VectorMath.add(q, d); // qd.setLabel(d.label()); data.add(d); } } reader.close(); writer.close(); SparseVector.write(DATA_FILE, data); IOUtils.write(WORD_INDEXER_FILE, wordIndexer); }