Beispiel #1
0
  /**
   * Populates the term info of every term in {@code terms}.
   *
   * <p>A single {@code TermInfoCollector} is built from the reader and the two field names, and
   * each term's string is passed to it; whatever the collector returns is stored on the term via
   * {@code setTermInfo}. Progress is logged at INFO level once up front (with the total number of
   * terms) and then after every 500 processed terms.
   *
   * @param leafReader index reader handed to the {@code TermInfoCollector}
   * @param terms terms to enrich; each element is updated in place
   * @param ngramInfoFieldname field name handed to the {@code TermInfoCollector}
   * @param idFieldname field name handed to the {@code TermInfoCollector}
   * @throws IOException declared by this method; presumably raised by the collector while
   *     reading the index (not visible from here)
   */
  public void collectTermInfo(
      LeafReader leafReader, List<JATETerm> terms, String ngramInfoFieldname, String idFieldname)
      throws IOException {
    final Logger logger = Logger.getLogger(this.getClass().getName());
    final TermInfoCollector collector =
        new TermInfoCollector(leafReader, ngramInfoFieldname, idFieldname);

    final int total = terms.size();
    logger.info(
        "Gathering term information (e.g., provenance and offsets). This may take a while. Total="
            + total);

    int processed = 0;
    for (JATETerm term : terms) {
      term.setTermInfo(collector.collect(term.getString()));
      processed += 1;

      // Only report progress on every 500th term.
      if (processed % 500 != 0) {
        continue;
      }
      logger.info("done " + processed);
    }
  }
Beispiel #2
0
  /**
   * Filters a list of terms by a minimum score and/or a "top N" cut-off.
   *
   * <p>Processing happens in two stages:
   *
   * <ol>
   *   <li><b>Score threshold ({@code t}).</b> If {@code t} parses as a double, terms are taken
   *       from the front of {@code terms} while their score is {@code >= t}; the scan stops at the
   *       first term that fails, so the input is assumed to be ordered by descending score. An
   *       unparsable {@code t} is treated as if no threshold had been given.
   *   <li><b>Top N ({@code n}).</b> If {@code n} is {@code null} and stage 1 selected at least one
   *       term, that selection is returned as is. Otherwise, if stage 1 selected nothing, all
   *       terms become candidates. The candidates are then truncated:
   *       <ul>
   *         <li>if {@code n} parses as an integer, to at most that many terms;
   *         <li>otherwise {@code n} is read as a fraction (falling back to
   *             {@code DEFAULT_THRESHOLD_N} when it is {@code null} or not a number) and the
   *             limit is {@code (int) (fraction * terms.size())}. Note the fraction is applied to
   *             the size of the full input, not of the stage-1 selection.
   *       </ul>
   *       A limit of zero or less yields an empty list.
   * </ol>
   *
   * <p>A {@code null} value for {@code n} combined with an empty stage-1 selection previously
   * escaped as a {@code NullPointerException} from {@code Double.valueOf(null)}; it now uses the
   * default fraction instead.
   *
   * @param terms candidate terms, expected in descending score order; not modified
   * @param t minimum score as a decimal string, or {@code null} for no score threshold
   * @param n either an integer count or a fraction of {@code terms.size()}, as a string; may be
   *     {@code null}
   * @return a new mutable list holding the selected terms in their original order
   */
  protected static List<JATETerm> applyThresholds(List<JATETerm> terms, String t, String n) {
    List<JATETerm> selected = new ArrayList<>();
    if (t != null) {
      try {
        double threshold = Double.parseDouble(t);
        for (JATETerm jt : terms) {
          if (jt.getScore() >= threshold) {
            selected.add(jt);
          } else {
            // Input is expected to be sorted by score, so nothing further can qualify.
            break;
          }
        }
      } catch (NumberFormatException ignored) {
        // Deliberate best-effort: an unparsable threshold means "no score threshold".
      }
    }

    if (n == null && !selected.isEmpty()) {
      return selected;
    }

    if (selected.isEmpty()) {
      selected.addAll(terms);
    }

    int limit = resolveTopNLimit(n, terms.size());
    // Clamp so that a negative limit empties the list, as the original removal loop did.
    int keep = Math.max(0, limit);
    if (keep < selected.size()) {
      selected.subList(keep, selected.size()).clear();
    }
    return selected;
  }

  /**
   * Converts the textual top-N setting into an absolute number of terms to keep: an integer is
   * used directly, anything else is treated as a fraction of {@code totalTerms}.
   */
  private static int resolveTopNLimit(String n, int totalTerms) {
    if (n != null) {
      try {
        return Integer.parseInt(n);
      } catch (NumberFormatException ignored) {
        // Not an integer count; fall through and interpret it as a fraction.
      }
    }

    double fraction;
    if (n == null) {
      // Double.parseDouble(null) throws NullPointerException, so handle null explicitly.
      fraction = DEFAULT_THRESHOLD_N;
    } else {
      try {
        fraction = Double.parseDouble(n);
      } catch (NumberFormatException ignored) {
        // Neither an integer nor a decimal: use the default fraction.
        fraction = DEFAULT_THRESHOLD_N;
      }
    }
    return (int) (fraction * totalTerms);
  }