/** Parses {@code s} with this union's branches; returns the first accepted result, else null. */
  ParseResult internalParse(
      String s, Map<String, Integer> targetUnionDecisions, boolean mustConsumeStr) {
    // Branch index recorded for this union under its name, or null when nothing was decided.
    Integer chosenBranch = (targetUnionDecisions == null) ? null : targetUnionDecisions.get(name);

    if (chosenBranch != null) {
      //
      // A decision exists: only that one branch is attempted.
      //
      ParseResult outcome =
          unionTypes.get(chosenBranch).internalParse(s, targetUnionDecisions, false);
      if (outcome == null) {
        return null;
      }
      // When the whole string must be consumed, only whitespace may be left over.
      if (mustConsumeStr && outcome.getRemainingString().trim().length() != 0) {
        return null;
      }
      return new ParseResult(outcome.getData(), outcome.hasData(), outcome.getRemainingString());
    }

    //
    // No decision: try the branches in order; the first acceptable result wins.
    //
    for (InferredType candidate : unionTypes) {
      ParseResult outcome = candidate.internalParse(s, targetUnionDecisions, false);
      if (outcome == null) {
        continue;
      }
      if (mustConsumeStr && outcome.getRemainingString().trim().length() != 0) {
        continue;
      }
      return new ParseResult(outcome.getData(), outcome.hasData(), outcome.getRemainingString());
    }
    return null;
  }
Beispiel #2
0
  /**
   * Copies every entry of the bucket cache into a fresh {@link MapWritable} and returns a second
   * {@link MapWritable} that holds that copy under {@code MAPWRITABLE_BUCKET_KEY}.
   *
   * @return a single-entry map wrapping the copied bucket cache
   */
  public MapWritable toMapWritableKey() {
    MapWritable buckets = new MapWritable();
    for (java.util.Map.Entry<IntWritable, Bucket> cached : bucketCache.entrySet()) {
      buckets.put(cached.getKey(), cached.getValue());
    }

    MapWritable wrapper = new MapWritable();
    wrapper.put(MAPWRITABLE_BUCKET_KEY, buckets);
    return wrapper;
  }
Beispiel #3
0
    /**
     * Writes at most the first 100 entries of {@code countMap}, taken in the order produced by
     * {@code sortByValues}, to the job output.
     *
     * @param context output sink for the selected key/count pairs
     * @throws IOException if writing to the context fails
     * @throws InterruptedException if writing to the context is interrupted
     */
    protected void cleanup(Context context) throws IOException, InterruptedException {
      Map<Text, IntWritable> ranked = sortByValues(countMap);

      int emitted = 0;
      for (Map.Entry<Text, IntWritable> entry : ranked.entrySet()) {
        if (emitted == 100) {
          break; // output is capped at 100 entries
        }
        context.write(entry.getKey(), entry.getValue());
        emitted++;
      }
    }
Beispiel #4
0
 /**
  * Adds up all counts received for {@code key} and records the total in {@code countMap}.
  *
  * @param key the key being reduced
  * @param values the partial counts for that key
  * @param context unused here; nothing is written from this method
  */
 public void reduce(Text key, Iterable<IntWritable> values, Context context)
     throws IOException, InterruptedException {
   int total = 0;
   for (IntWritable partial : values) {
     total = total + partial.get();
   }

   // New Text/IntWritable instances are stored instead of the objects passed in.
   Text keyCopy = new Text(key);
   IntWritable totalCount = new IntWritable(total);
   countMap.put(keyCopy, totalCount);
 }
Beispiel #5
0
  /**
   * Builds the cache by reading every key/bucket pair from the MapFile found under each path
   * returned by {@code PathUtils.getCachePaths(conf)}, then prints each loaded bucket to stdout.
   *
   * <p>Each {@code MapFile.Reader} is closed once its file has been read, including when reading
   * fails part-way through, so no file handles stay open.
   *
   * @param conf configuration used to locate the cache paths and to open the readers
   * @throws IOException if a MapFile cannot be opened, read, or closed
   */
  public BucketCache(Configuration conf) throws IOException {
    bucketCache = new HashMap<IntWritable, Bucket>();

    for (String cachePath : PathUtils.getCachePaths(conf)) {
      String bucketCachePath = cachePath + BUCKET_CACHE_FOLDER;
      MapFile.Reader reader = new MapFile.Reader(new Path(bucketCachePath), conf);
      try {
        IntWritable key = new IntWritable();
        Bucket value = new Bucket();
        while (reader.next(key, value)) {
          // The same key/value instances are handed to every next() call, so copies are stored.
          bucketCache.put(new IntWritable(key.get()), new Bucket(value));
        }
      } finally {
        // Always release the reader; previously it was left open for every cache path.
        reader.close();
      }
    }

    for (IntWritable i : bucketCache.keySet()) {
      System.out.println("Loaded bucket from cache:" + i.get() + ":" + bucketCache.get(i));
    }
  }
Beispiel #6
0
    /**
     * Returns a new insertion-ordered map holding the entries of {@code map}, arranged in
     * descending order of their values' {@code compareTo} ordering.
     *
     * @param map the entries to order; it is not modified
     * @return a {@link LinkedHashMap} that iterates from the greatest value to the smallest
     */
    public static <K extends Text, V extends IntWritable> Map<K, V> sortByValues(Map<K, V> map) {
      Comparator<Map.Entry<K, V>> greatestFirst =
          new Comparator<Map.Entry<K, V>>() {

            public int compare(Map.Entry<K, V> left, Map.Entry<K, V> right) {
              // Operands are swapped on purpose so that larger values sort first.
              return right.getValue().compareTo(left.getValue());
            }
          };

      List<Map.Entry<K, V>> ranked = new LinkedList<Map.Entry<K, V>>(map.entrySet());
      Collections.sort(ranked, greatestFirst);

      // A LinkedHashMap keeps the sorted order when the result is iterated.
      Map<K, V> ordered = new LinkedHashMap<K, V>();
      for (Map.Entry<K, V> rankedEntry : ranked) {
        ordered.put(rankedEntry.getKey(), rankedEntry.getValue());
      }
      return ordered;
    }
Beispiel #7
0
  /**
   * Writes the bucket cache to a MapFile under the configured cache path and, on request,
   * registers the resulting files with the distributed cache.
   *
   * <p>Keys are sorted before they are appended (presumably because the MapFile writer expects
   * ascending keys; confirm against the Hadoop documentation).
   *
   * @param conf configuration used for the cache path, the file system and the writer
   * @param writeToDistributedCache when true, every non-directory entry found under the cache
   *     path is added to the distributed cache of {@code conf}
   * @throws IOException if the file system, the writer, or the directory listing fails
   */
  @SuppressWarnings("unchecked")
  public void writeToDisk(Configuration conf, boolean writeToDistributedCache) throws IOException {
    String cacheDirName = PathUtils.getCachePath(conf) + BUCKET_CACHE_FOLDER;
    FileSystem fileSystem = FileSystem.get(conf);

    MapFile.Writer out = null;
    try {
      out =
          new MapFile.Writer(
              conf,
              new Path(cacheDirName),
              MapFile.Writer.keyClass(IntWritable.class),
              MapFile.Writer.valueClass(Bucket.class));

      // Snapshot the keys, then append the buckets in ascending key order.
      ArrayList<IntWritable> orderedKeys = new ArrayList<IntWritable>(bucketCache.keySet());
      Collections.sort(orderedKeys);
      for (IntWritable bucketKey : orderedKeys) {
        out.append(bucketKey, bucketCache.get(bucketKey));
      }
    } finally {
      if (out != null) {
        IOUtils.closeStream(out);
      }
    }

    if (!writeToDistributedCache) {
      return;
    }
    for (FileStatus entry : fileSystem.listStatus(new Path(cacheDirName))) {
      if (entry.isDirectory()) {
        continue; // only plain files are registered
      }
      DistributedCache.addCacheFile(entry.getPath().toUri(), conf);
    }
  }
    /**
     * Emits, for every non-stop-word token of {@code val}, the token together with {@code key}
     * followed by a window of surrounding tokens.
     *
     * <p>The value is lower-cased and split with {@code tokenDelimiter}. The window for the token
     * at position {@code p} covers positions {@code p - 10} (inclusive) to {@code p + 10}
     * (exclusive), clipped to the token range. The fields {@code cache}, {@code keyWord} and
     * {@code location} are overwritten as a side effect.
     *
     * @param key text placed at the start of every emitted location string
     * @param val the content to tokenize
     * @param context receives one (token, location) pair per non-stop-word token
     */
    public void map(
        Text key,
        Text val,
        org.apache.hadoop.mapreduce.Mapper<Text, Text, Text, Text>.Context context)
        throws IOException, InterruptedException {
      initStopWordsMap(); // make sure the stop-word table is populated

      StringTokenizer tokens = new StringTokenizer(val.toString().toLowerCase(), tokenDelimiter);
      final int total = tokens.countTokens();

      // Fill the cache with the tokens, in order of appearance.
      cache = new String[total];
      for (int slot = 0; tokens.hasMoreTokens(); slot++) {
        cache[slot] = tokens.nextToken();
      }

      for (int pos = 0; pos < total; pos++) {
        keyWord = cache[pos].trim();
        if (hmStopWord.containsKey(keyWord)) {
          continue; // stop words produce no output
        }

        int from = Math.max(pos - 10, 0);
        int to = Math.min(pos + 10, total);

        // Window text: a leading space, then each token followed by a space.
        StringBuilder window = new StringBuilder(" ");
        for (int w = from; w < to; w++) {
          window.append(cache[w]).append(" ");
        }

        location = new Text();
        location.set(key.toString() + window.toString());
        context.write(new Text(keyWord), location);
      }
    }
  /**
   * Registers every element of {@code stopWords} as a key of {@code hmStopWord}, each mapped to
   * {@code null}; only the presence of the key is meaningful.
   */
  public static void initStopWordsMap() {
    int idx = 0;
    while (idx < stopWords.length) {
      hmStopWord.put(stopWords[idx], null);
      idx++;
    }
  }
Beispiel #10
0
    /**
     * Converts one line of seed text into an injected {@link CrawlDatum}.
     *
     * <p>Lines starting with {@code #} (after trimming) are ignored. The text before the first
     * tab is the URL; later tab-separated {@code name=value} fields either override the score or
     * a fetch interval (when the name matches one of the reserved metadata names) or are stored
     * as metadata. The URL is normalized and filtered; rejected URLs only increment the
     * {@code injector/urls_filtered} counter, accepted ones are collected and increment
     * {@code injector/urls_injected}.
     *
     * @param key not used
     * @param value the input line; it is overwritten with the normalized URL when accepted
     * @param output receives the (URL, datum) pair for accepted URLs
     * @param reporter provides the counters
     * @throws IOException if collecting the output fails
     */
    public void map(
        WritableComparable<?> key,
        Text value,
        OutputCollector<Text, CrawlDatum> output,
        Reporter reporter)
        throws IOException {
      String url = value.toString(); // one line of the input text

      // Comment lines are skipped entirely.
      if (url != null && url.trim().startsWith("#")) {
        return;
      }

      // Optional tab-separated name=value fields may follow the URL.
      float customScore = -1f;
      int customInterval = interval;
      int fixedInterval = -1;
      Map<String, String> metadata = new TreeMap<String, String>();
      if (url.indexOf("\t") != -1) {
        String[] fields = url.split("\t");
        url = fields[0];
        for (int f = 1; f < fields.length; f++) {
          String field = fields[f];
          int eq = field.indexOf("=");
          if (eq == -1) {
            continue; // not a name=value pair
          }
          String metaname = field.substring(0, eq);
          String metavalue = field.substring(eq + 1);
          try {
            if (metaname.equals(nutchScoreMDName)) {
              customScore = Float.parseFloat(metavalue);
            } else if (metaname.equals(nutchFetchIntervalMDName)) {
              customInterval = Integer.parseInt(metavalue);
            } else if (metaname.equals(nutchFixedFetchIntervalMDName)) {
              fixedInterval = Integer.parseInt(metavalue);
            } else {
              metadata.put(metaname, metavalue);
            }
          } catch (NumberFormatException ignored) {
            // An unparsable number leaves the corresponding default in place.
          }
        }
      }

      // Normalize, then filter; any failure is treated the same as a rejected URL.
      try {
        url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_INJECT);
        url = filters.filter(url);
      } catch (Exception e) {
        if (LOG.isWarnEnabled()) {
          LOG.warn("Skipping " + url + ":" + e);
        }
        url = null;
      }

      if (url == null) {
        reporter.getCounter("injector", "urls_filtered").increment(1);
        return;
      }

      // The URL passed: build its datum.
      value.set(url);
      CrawlDatum datum = new CrawlDatum();
      datum.setStatus(CrawlDatum.STATUS_INJECTED);

      if (fixedInterval > -1) {
        // A fixed interval is also recorded in the datum's metadata, stored as a float writable.
        FloatWritable fixedAsFloat = new FloatWritable(fixedInterval);
        datum.getMetaData().put(Nutch.WRITABLE_FIXED_INTERVAL_KEY, fixedAsFloat);
        datum.setFetchInterval(fixedInterval);
      } else {
        datum.setFetchInterval(customInterval);
      }

      datum.setFetchTime(curTime);

      // Copy the remaining name=value pairs into the datum's metadata.
      for (Map.Entry<String, String> pair : metadata.entrySet()) {
        datum.getMetaData().put(new Text(pair.getKey()), new Text(pair.getValue()));
      }

      // -1 means no score was supplied on the line.
      if (customScore == -1) {
        datum.setScore(scoreInjected);
      } else {
        datum.setScore(customScore);
      }

      try {
        scfilters.injectedScore(value, datum);
      } catch (ScoringFilterException e) {
        if (LOG.isWarnEnabled()) {
          LOG.warn(
              "Cannot filter injected score for url "
                  + url
                  + ", using default ("
                  + e.getMessage()
                  + ")");
        }
      }

      reporter.getCounter("injector", "urls_injected").increment(1);
      output.collect(value, datum);
    }
Beispiel #11
0
 /**
  * Looks up the cached bucket stored under the cache hash of {@code bucketStripped} and returns
  * its bucket name.
  *
  * @param bucketStripped supplies the cache hash used for the lookup
  * @return the name of the cached bucket
  */
 public Text getBucketName(BucketStripped bucketStripped) {
   return bucketCache.get(bucketStripped.getCacheHash()).getBucketName();
 }
Beispiel #12
0
 /**
  * Builds a full bucket from a stripped one: copies the cached bucket stored under the stripped
  * bucket's cache hash, sets the stripped bucket's value on the copy, and recomputes its hash.
  *
  * @param bucketStripped supplies the cache hash and the bucket value
  * @return a new bucket; the cached instance itself is not modified here
  */
 public Bucket getBucket(BucketStripped bucketStripped) throws IOException {
   Bucket cached = bucketCache.get(bucketStripped.getCacheHash());
   Bucket restored = new Bucket(cached);

   Text restoredValue = new Text(bucketStripped.getBucketValue());
   restored.setBucketValue(restoredValue);
   restored.computeHash();
   return restored;
 }
Beispiel #13
0
 /**
  * Stores {@code bucket} in the cache under the index that
  * {@code BitmapIndex.getBucketCacheIndex} computes for it, replacing any entry already there.
  *
  * @param bucket the bucket to cache
  */
 public void addBucket(Bucket bucket) {
   IntWritable slot = new IntWritable(BitmapIndex.getBucketCacheIndex(bucket));
   bucketCache.put(slot, bucket);
 }