Пример #1
0
    protected void cleanup(Context context) throws IOException, InterruptedException {

      Map<Text, IntWritable> sortedMap = sortByValues(countMap);

      int counter = 0;
      for (Text key : sortedMap.keySet()) {
        if (counter++ == 100) {
          break;
        }
        context.write(key, sortedMap.get(key));
      }
    }
  /**
   * Test how IPC cache map works.
   *
   * @throws Exception If failed.
   */
  @SuppressWarnings("unchecked")
  public void testIpcCache() throws Exception {
    Field cacheField = GridGgfsHadoopIpcIo.class.getDeclaredField("ipcCache");

    cacheField.setAccessible(true);

    Field activeCntField = GridGgfsHadoopIpcIo.class.getDeclaredField("activeCnt");

    activeCntField.setAccessible(true);

    Map<String, GridGgfsHadoopIpcIo> cache =
        (Map<String, GridGgfsHadoopIpcIo>) cacheField.get(null);

    String name = "ggfs:" + getTestGridName(0) + "@";

    Configuration cfg = new Configuration();

    cfg.addResource(U.resolveGridGainUrl(HADOOP_FS_CFG));
    cfg.setBoolean("fs.ggfs.impl.disable.cache", true);
    cfg.setBoolean(String.format(GridGgfsHadoopUtils.PARAM_GGFS_ENDPOINT_NO_EMBED, name), true);

    // Ensure that existing IO is reused.
    FileSystem fs1 = FileSystem.get(new URI("ggfs://" + name + "/"), cfg);

    assertEquals(1, cache.size());

    GridGgfsHadoopIpcIo io = null;

    System.out.println("CACHE: " + cache);

    for (String key : cache.keySet()) {
      if (key.contains("10500")) {
        io = cache.get(key);

        break;
      }
    }

    assert io != null;

    assertEquals(1, ((AtomicInteger) activeCntField.get(io)).get());

    // Ensure that when IO is used by multiple file systems and one of them is closed, IO is not
    // stopped.
    FileSystem fs2 = FileSystem.get(new URI("ggfs://" + name + "/abc"), cfg);

    assertEquals(1, cache.size());
    assertEquals(2, ((AtomicInteger) activeCntField.get(io)).get());

    fs2.close();

    assertEquals(1, cache.size());
    assertEquals(1, ((AtomicInteger) activeCntField.get(io)).get());

    Field stopField = GridGgfsHadoopIpcIo.class.getDeclaredField("stopping");

    stopField.setAccessible(true);

    assert !(Boolean) stopField.get(io);

    // Ensure that IO is stopped when nobody else is need it.
    fs1.close();

    assert cache.isEmpty();

    assert (Boolean) stopField.get(io);
  }
Пример #3
0
  /**
   * Search for pages matching a query, eliminating excessive hits with matching values for a named
   * field. Hits after the first <code>maxHitsPerDup</code> are removed from results. The remaining
   * hits have {@link Hit#moreFromDupExcluded()} set.
   *
   * <p>If maxHitsPerDup is zero then all hits are returned.
   *
   * @param query query
   * @param numHits number of requested hits
   * @param maxHitsPerDup the maximum hits returned with matching values, or zero
   * @param dedupField field name to check for duplicates
   * @param sortField Field to sort on (or null if no sorting).
   * @param reverse True if we are to reverse sort by <code>sortField</code>.
   * @return Hits the matching hits
   * @throws IOException
   */
  public Hits search(
      Query query,
      int numHits,
      int maxHitsPerDup,
      String dedupField,
      String sortField,
      boolean reverse)
      throws IOException {
    if (maxHitsPerDup <= 0) // disable dup checking
    return search(query, numHits, dedupField, sortField, reverse);

    final float rawHitsFactor = this.conf.getFloat("searcher.hostgrouping.rawhits.factor", 2.0f);
    int numHitsRaw = (int) (numHits * rawHitsFactor);
    if (LOG.isInfoEnabled()) {
      LOG.info("searching for " + numHitsRaw + " raw hits");
    }
    Hits hits = searchBean.search(query, numHitsRaw, dedupField, sortField, reverse);
    final long total = hits.getTotal();
    final Map<String, DupHits> dupToHits = new HashMap<String, DupHits>();
    final List<Hit> resultList = new ArrayList<Hit>();
    final Set<Hit> seen = new HashSet<Hit>();
    final List<String> excludedValues = new ArrayList<String>();
    boolean totalIsExact = true;
    for (int rawHitNum = 0; rawHitNum < hits.getTotal(); rawHitNum++) {
      // get the next raw hit
      if (rawHitNum >= hits.getLength()) {
        // optimize query by prohibiting more matches on some excluded values
        final Query optQuery = (Query) query.clone();
        for (int i = 0; i < excludedValues.size(); i++) {
          if (i == MAX_PROHIBITED_TERMS) break;
          optQuery.addProhibitedTerm(excludedValues.get(i), dedupField);
        }
        numHitsRaw = (int) (numHitsRaw * rawHitsFactor);
        if (LOG.isInfoEnabled()) {
          LOG.info("re-searching for " + numHitsRaw + " raw hits, query: " + optQuery);
        }
        hits = searchBean.search(optQuery, numHitsRaw, dedupField, sortField, reverse);
        if (LOG.isInfoEnabled()) {
          LOG.info("found " + hits.getTotal() + " raw hits");
        }
        rawHitNum = -1;
        continue;
      }

      final Hit hit = hits.getHit(rawHitNum);
      if (seen.contains(hit)) continue;
      seen.add(hit);

      // get dup hits for its value
      final String value = hit.getDedupValue();
      DupHits dupHits = dupToHits.get(value);
      if (dupHits == null) dupToHits.put(value, dupHits = new DupHits());

      // does this hit exceed maxHitsPerDup?
      if (dupHits.size() == maxHitsPerDup) { // yes -- ignore the hit
        if (!dupHits.maxSizeExceeded) {

          // mark prior hits with moreFromDupExcluded
          for (int i = 0; i < dupHits.size(); i++) {
            dupHits.get(i).setMoreFromDupExcluded(true);
          }
          dupHits.maxSizeExceeded = true;

          excludedValues.add(value); // exclude dup
        }
        totalIsExact = false;
      } else { // no -- collect the hit
        resultList.add(hit);
        dupHits.add(hit);

        // are we done?
        // we need to find one more than asked for, so that we can tell if
        // there are more hits to be shown
        if (resultList.size() > numHits) break;
      }
    }

    final Hits results = new Hits(total, resultList.toArray(new Hit[resultList.size()]));
    results.setTotalIsExact(totalIsExact);
    return results;
  }
Пример #4
0
    public void map(
        WritableComparable<?> key,
        Text value,
        OutputCollector<Text, CrawlDatum> output,
        Reporter reporter)
        throws IOException {
      String url = value.toString(); // value is line of text

      if (url != null && url.trim().startsWith("#")) {
        /* Ignore line that start with # */
        return;
      }

      // if tabs : metadata that could be stored
      // must be name=value and separated by \t
      float customScore = -1f;
      int customInterval = interval;
      int fixedInterval = -1;
      Map<String, String> metadata = new TreeMap<String, String>();
      if (url.indexOf("\t") != -1) {
        String[] splits = url.split("\t");
        url = splits[0];
        for (int s = 1; s < splits.length; s++) {
          // find separation between name and value
          int indexEquals = splits[s].indexOf("=");
          if (indexEquals == -1) {
            // skip anything without a =
            continue;
          }
          String metaname = splits[s].substring(0, indexEquals);
          String metavalue = splits[s].substring(indexEquals + 1);
          if (metaname.equals(nutchScoreMDName)) {
            try {
              customScore = Float.parseFloat(metavalue);
            } catch (NumberFormatException nfe) {
            }
          } else if (metaname.equals(nutchFetchIntervalMDName)) {
            try {
              customInterval = Integer.parseInt(metavalue);
            } catch (NumberFormatException nfe) {
            }
          } else if (metaname.equals(nutchFixedFetchIntervalMDName)) {
            try {
              fixedInterval = Integer.parseInt(metavalue);
            } catch (NumberFormatException nfe) {
            }
          } else metadata.put(metaname, metavalue);
        }
      }
      try {
        url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_INJECT);
        url = filters.filter(url); // filter the url
      } catch (Exception e) {
        if (LOG.isWarnEnabled()) {
          LOG.warn("Skipping " + url + ":" + e);
        }
        url = null;
      }
      if (url == null) {
        reporter.getCounter("injector", "urls_filtered").increment(1);
      } else { // if it passes
        value.set(url); // collect it
        CrawlDatum datum = new CrawlDatum();
        datum.setStatus(CrawlDatum.STATUS_INJECTED);

        // Is interval custom? Then set as meta data
        if (fixedInterval > -1) {
          // Set writable using float. Flaot is used by AdaptiveFetchSchedule
          datum
              .getMetaData()
              .put(Nutch.WRITABLE_FIXED_INTERVAL_KEY, new FloatWritable(fixedInterval));
          datum.setFetchInterval(fixedInterval);
        } else {
          datum.setFetchInterval(customInterval);
        }

        datum.setFetchTime(curTime);
        // now add the metadata
        Iterator<String> keysIter = metadata.keySet().iterator();
        while (keysIter.hasNext()) {
          String keymd = keysIter.next();
          String valuemd = metadata.get(keymd);
          datum.getMetaData().put(new Text(keymd), new Text(valuemd));
        }
        if (customScore != -1) datum.setScore(customScore);
        else datum.setScore(scoreInjected);
        try {
          scfilters.injectedScore(value, datum);
        } catch (ScoringFilterException e) {
          if (LOG.isWarnEnabled()) {
            LOG.warn(
                "Cannot filter injected score for url "
                    + url
                    + ", using default ("
                    + e.getMessage()
                    + ")");
          }
        }
        reporter.getCounter("injector", "urls_injected").increment(1);
        output.collect(value, datum);
      }
    }