Ejemplo n.º 1
0
    public void reduce(
        Text key,
        Iterator<CrawlDatum> values,
        OutputCollector<Text, CrawlDatum> output,
        Reporter reporter)
        throws IOException {
      boolean oldSet = false;
      boolean injectedSet = false;
      while (values.hasNext()) {
        CrawlDatum val = values.next();
        if (val.getStatus() == CrawlDatum.STATUS_INJECTED) {
          injected.set(val);
          injected.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
          injectedSet = true;
        } else {
          old.set(val);
          oldSet = true;
        }
      }
      CrawlDatum res = null;

      /**
       * Whether to overwrite, ignore or update existing records
       *
       * @see https://issues.apache.org/jira/browse/NUTCH-1405
       */

      // Injected record already exists and overwrite but not update
      if (injectedSet && oldSet && overwrite) {
        res = injected;

        if (update) {
          LOG.info(key.toString() + " overwritten with injected record but update was specified.");
        }
      }

      // Injected record already exists and update but not overwrite
      if (injectedSet && oldSet && update && !overwrite) {
        res = old;
        old.putAllMetaData(injected);
        old.setScore(injected.getScore() != scoreInjected ? injected.getScore() : old.getScore());
        old.setFetchInterval(
            injected.getFetchInterval() != interval
                ? injected.getFetchInterval()
                : old.getFetchInterval());
      }

      // Old default behaviour
      if (injectedSet && !oldSet) {
        res = injected;
      } else {
        res = old;
      }

      output.collect(key, res);
    }
Ejemplo n.º 2
0
      public void map(
          BytesWritable key,
          BytesWritable value,
          OutputCollector<BytesWritable, IntWritable> output,
          Reporter reporter)
          throws IOException {
        // newKey = (key, value)
        BytesWritable keyValue = new BytesWritable(pair(key, value));

        // output (newKey, value)
        output.collect(keyValue, this.value);
      }
Ejemplo n.º 3
0
      public void reduce(
          IntWritable key,
          Iterator<RecordStatsWritable> values,
          OutputCollector<IntWritable, RecordStatsWritable> output,
          Reporter reporter)
          throws IOException {
        long bytes = 0;
        long records = 0;
        int xor = 0;
        while (values.hasNext()) {
          RecordStatsWritable stats = values.next();
          bytes += stats.getBytes();
          records += stats.getRecords();
          xor ^= stats.getChecksum();
        }

        output.collect(key, new RecordStatsWritable(bytes, records, xor));
      }
Ejemplo n.º 4
0
  public void map(
      LongWritable key, Point value, OutputCollector<LongWritable, Point> output, Reporter reporter)
      throws IOException {
    double min = value.sumOfSquares(centers.get(0));
    int best = 0;

    for (int index = 1; index < numberOfCenters; ++index) {
      double current = value.sumOfSquares(centers.get(index));

      if (current < min) {
        min = current;
        best = index;
      }
    }

    reporter.incrCounter("NUMBER", "NODES", 1);
    reporter.incrCounter("CENTER", "" + best, 1);

    output.collect(new LongWritable(best), value);
  }
Ejemplo n.º 5
0
    public void map(
        WritableComparable<?> key,
        Text value,
        OutputCollector<Text, CrawlDatum> output,
        Reporter reporter)
        throws IOException {
      String url = value.toString(); // value is line of text

      if (url != null && url.trim().startsWith("#")) {
        /* Ignore line that start with # */
        return;
      }

      // if tabs : metadata that could be stored
      // must be name=value and separated by \t
      float customScore = -1f;
      int customInterval = interval;
      int fixedInterval = -1;
      Map<String, String> metadata = new TreeMap<String, String>();
      if (url.indexOf("\t") != -1) {
        String[] splits = url.split("\t");
        url = splits[0];
        for (int s = 1; s < splits.length; s++) {
          // find separation between name and value
          int indexEquals = splits[s].indexOf("=");
          if (indexEquals == -1) {
            // skip anything without a =
            continue;
          }
          String metaname = splits[s].substring(0, indexEquals);
          String metavalue = splits[s].substring(indexEquals + 1);
          if (metaname.equals(nutchScoreMDName)) {
            try {
              customScore = Float.parseFloat(metavalue);
            } catch (NumberFormatException nfe) {
            }
          } else if (metaname.equals(nutchFetchIntervalMDName)) {
            try {
              customInterval = Integer.parseInt(metavalue);
            } catch (NumberFormatException nfe) {
            }
          } else if (metaname.equals(nutchFixedFetchIntervalMDName)) {
            try {
              fixedInterval = Integer.parseInt(metavalue);
            } catch (NumberFormatException nfe) {
            }
          } else metadata.put(metaname, metavalue);
        }
      }
      try {
        url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_INJECT);
        url = filters.filter(url); // filter the url
      } catch (Exception e) {
        if (LOG.isWarnEnabled()) {
          LOG.warn("Skipping " + url + ":" + e);
        }
        url = null;
      }
      if (url == null) {
        reporter.getCounter("injector", "urls_filtered").increment(1);
      } else { // if it passes
        value.set(url); // collect it
        CrawlDatum datum = new CrawlDatum();
        datum.setStatus(CrawlDatum.STATUS_INJECTED);

        // Is interval custom? Then set as meta data
        if (fixedInterval > -1) {
          // Set writable using float. Flaot is used by AdaptiveFetchSchedule
          datum
              .getMetaData()
              .put(Nutch.WRITABLE_FIXED_INTERVAL_KEY, new FloatWritable(fixedInterval));
          datum.setFetchInterval(fixedInterval);
        } else {
          datum.setFetchInterval(customInterval);
        }

        datum.setFetchTime(curTime);
        // now add the metadata
        Iterator<String> keysIter = metadata.keySet().iterator();
        while (keysIter.hasNext()) {
          String keymd = keysIter.next();
          String valuemd = metadata.get(keymd);
          datum.getMetaData().put(new Text(keymd), new Text(valuemd));
        }
        if (customScore != -1) datum.setScore(customScore);
        else datum.setScore(scoreInjected);
        try {
          scfilters.injectedScore(value, datum);
        } catch (ScoringFilterException e) {
          if (LOG.isWarnEnabled()) {
            LOG.warn(
                "Cannot filter injected score for url "
                    + url
                    + ", using default ("
                    + e.getMessage()
                    + ")");
          }
        }
        reporter.getCounter("injector", "urls_injected").increment(1);
        output.collect(value, datum);
      }
    }
Ejemplo n.º 6
0
      @SuppressWarnings("unchecked")
      public void map(
          WritableComparable key,
          Writable value,
          OutputCollector<IntWritable, RecordStatsWritable> output,
          Reporter reporter)
          throws IOException {
        // Set up rawKey and rawValue on the first call to 'map'
        if (recordId == -1) {
          rawKey = createRaw(key.getClass());
          rawValue = createRaw(value.getClass());
        }
        ++recordId;

        if (this.key == sortOutput) {
          // Check if keys are 'sorted' if this
          // record is from sort's output
          if (prevKey == null) {
            prevKey = key;
            keyClass = prevKey.getClass();
          } else {
            // Sanity check
            if (keyClass != key.getClass()) {
              throw new IOException(
                  "Type mismatch in key: expected "
                      + keyClass.getName()
                      + ", recieved "
                      + key.getClass().getName());
            }

            // Check if they were sorted correctly
            if (prevKey.compareTo(key) > 0) {
              throw new IOException(
                  "The 'map-reduce' framework wrongly"
                      + " classifed ("
                      + prevKey
                      + ") > ("
                      + key
                      + ") "
                      + "for record# "
                      + recordId);
            }
            prevKey = key;
          }

          // Check if the sorted output is 'partitioned' right
          int keyPartition = partitioner.getPartition(key, value, noSortReducers);
          if (partition != keyPartition) {
            throw new IOException(
                "Partitions do not match for record# "
                    + recordId
                    + " ! - '"
                    + partition
                    + "' v/s '"
                    + keyPartition
                    + "'");
          }
        }

        // Construct the record-stats and output (this.key, record-stats)
        byte[] keyBytes = rawKey.getRawBytes(key);
        int keyBytesLen = rawKey.getRawBytesLength(key);
        byte[] valueBytes = rawValue.getRawBytes(value);
        int valueBytesLen = rawValue.getRawBytesLength(value);

        int keyValueChecksum =
            (WritableComparator.hashBytes(keyBytes, keyBytesLen)
                ^ WritableComparator.hashBytes(valueBytes, valueBytesLen));

        output.collect(
            this.key, new RecordStatsWritable((keyBytesLen + valueBytesLen), 1, keyValueChecksum));
      }