/** Parse the given string, return resulting data if appropriate. */
ParseResult internalParse(String s, Map<String, Integer> targetUnionDecisions, boolean mustConsumeStr) {
  //
  // If there's no target decision, then go ahead and try all branches.
  //
  if (targetUnionDecisions == null || targetUnionDecisions.get(name) == null) {
    for (InferredType subelt : unionTypes) {
      ParseResult pr = subelt.internalParse(s, targetUnionDecisions, false);
      if (pr != null && (!mustConsumeStr || pr.getRemainingString().trim().isEmpty())) {
        return new ParseResult(pr.getData(), pr.hasData(), pr.getRemainingString());
      }
    }
    return null;
  }

  //
  // If there is a target decision, then carry it out.
  //
  InferredType subelt = unionTypes.get(targetUnionDecisions.get(name));
  ParseResult pr = subelt.internalParse(s, targetUnionDecisions, false);
  if (pr != null && (!mustConsumeStr || pr.getRemainingString().trim().isEmpty())) {
    return new ParseResult(pr.getData(), pr.hasData(), pr.getRemainingString());
  }
  return null;
}
public MapWritable toMapWritableKey() {
  MapWritable mw = new MapWritable();
  MapWritable bucketMap = new MapWritable();
  for (IntWritable key : bucketCache.keySet()) {
    bucketMap.put(key, bucketCache.get(key));
  }
  mw.put(MAPWRITABLE_BUCKET_KEY, bucketMap);
  return mw;
}
protected void cleanup(Context context) throws IOException, InterruptedException {
  // Emit only the 100 highest-count entries, in descending order.
  Map<Text, IntWritable> sortedMap = sortByValues(countMap);
  int counter = 0;
  for (Text key : sortedMap.keySet()) {
    if (counter++ == 100) {
      break;
    }
    context.write(key, sortedMap.get(key));
  }
}
public void reduce(Text key, Iterable<IntWritable> values, Context context)
    throws IOException, InterruptedException {
  int sum = 0;
  for (IntWritable val : values) {
    sum += val.get();
  }
  // Copy the key: Hadoop reuses the same Text instance across reduce() calls.
  countMap.put(new Text(key), new IntWritable(sum));
}
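/*
 * Hedged sketch (not from the original source): the reduce() above buffers
 * totals in countMap and the cleanup() above emits the top 100 on shutdown.
 * A minimal driver wiring the pair into a job might look like this;
 * TopWordsReducer and TokenizerMapper are hypothetical names for the
 * enclosing reducer class and a matching mapper emitting (word, 1) pairs.
 */
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "top words");
job.setJarByClass(TopWordsReducer.class);
job.setMapperClass(TokenizerMapper.class);   // emits (Text word, IntWritable 1)
job.setReducerClass(TopWordsReducer.class);  // holds reduce() and cleanup() above
job.setNumReduceTasks(1);                    // one reducer must see all counts for a global top 100
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);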
public BucketCache(Configuration conf) throws IOException {
  bucketCache = new HashMap<IntWritable, Bucket>();
  for (String cachePath : PathUtils.getCachePaths(conf)) {
    String bucketCachePath = cachePath + BUCKET_CACHE_FOLDER;
    MapFile.Reader reader = new MapFile.Reader(new Path(bucketCachePath), conf);
    try {
      IntWritable key = new IntWritable();
      Bucket value = new Bucket();
      // Copy each entry: the reader reuses the key/value instances it fills.
      while (reader.next(key, value)) {
        bucketCache.put(new IntWritable(key.get()), new Bucket(value));
      }
    } finally {
      IOUtils.closeStream(reader); // previously the reader was never closed
    }
  }
  for (IntWritable i : bucketCache.keySet()) {
    System.out.println("Loaded bucket from cache:" + i.get() + ":" + bucketCache.get(i));
  }
}
public static <K extends Text, V extends IntWritable> Map<K, V> sortByValues(Map<K, V> map) {
  List<Map.Entry<K, V>> entries = new LinkedList<Map.Entry<K, V>>(map.entrySet());
  // Sort entries by value, descending.
  Collections.sort(entries, new Comparator<Map.Entry<K, V>>() {
    public int compare(Map.Entry<K, V> o1, Map.Entry<K, V> o2) {
      return o2.getValue().compareTo(o1.getValue());
    }
  });
  // A LinkedHashMap preserves the sorted insertion order on iteration.
  Map<K, V> sortedMap = new LinkedHashMap<K, V>();
  for (Map.Entry<K, V> entry : entries) {
    sortedMap.put(entry.getKey(), entry.getValue());
  }
  return sortedMap;
}
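/*
 * Hedged usage sketch (not from the original source), showing that
 * sortByValues() orders entries by descending count and that the returned
 * LinkedHashMap keeps that order when iterated.
 */
Map<Text, IntWritable> counts = new HashMap<Text, IntWritable>();
counts.put(new Text("hadoop"), new IntWritable(42));
counts.put(new Text("mapreduce"), new IntWritable(7));
for (Map.Entry<Text, IntWritable> e : sortByValues(counts).entrySet()) {
  System.out.println(e.getKey() + "=" + e.getValue()); // hadoop=42, then mapreduce=7
}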
@SuppressWarnings("unchecked")
public void writeToDisk(Configuration conf, boolean writeToDistributedCache) throws IOException {
  String bucketCachePath = PathUtils.getCachePath(conf) + BUCKET_CACHE_FOLDER;
  FileSystem fs = FileSystem.get(conf);
  MapFile.Writer writer = null;
  try {
    writer = new MapFile.Writer(
        conf,
        new Path(bucketCachePath),
        MapFile.Writer.keyClass(IntWritable.class),
        MapFile.Writer.valueClass(Bucket.class));
    // MapFile requires appends in ascending key order, so sort the keys first.
    ArrayList<IntWritable> keyList = new ArrayList<IntWritable>(bucketCache.keySet());
    Collections.sort(keyList);
    for (IntWritable i : keyList) {
      writer.append(i, bucketCache.get(i));
    }
  } finally {
    if (writer != null) {
      IOUtils.closeStream(writer);
    }
  }
  if (writeToDistributedCache) {
    for (FileStatus status : fs.listStatus(new Path(bucketCachePath))) {
      if (!status.isDirectory()) {
        DistributedCache.addCacheFile(status.getPath().toUri(), conf);
      }
    }
  }
}
public void map(Text key, Text val,
    org.apache.hadoop.mapreduce.Mapper<Text, Text, Text, Text>.Context context)
    throws IOException, InterruptedException {
  initStopWordsMap(); // initialize the stop list
  String line = val.toString();
  StringTokenizer itr = new StringTokenizer(line.toLowerCase(), tokenDelimiter); // set delimiter
  int n = itr.countTokens();
  cache = new String[n];
  int i = 0;
  while (itr.hasMoreTokens()) {
    cache[i] = itr.nextToken(); // fill the cache with the words of the content
    i++;
  }
  for (i = 0; i < n; i++) {
    keyWord = cache[i].trim();
    if (!hmStopWord.containsKey(keyWord)) {
      // Build a context window around the keyword (up to 10 words before
      // and 9 after, keeping the original bounds), then emit it.
      int lj = Math.max(i - 10, 0);
      int hj = Math.min(i + 10, n);
      StringBuilder tem = new StringBuilder(" ");
      for (int j = lj; j < hj; j++) {
        tem.append(cache[j]).append(" ");
      }
      location = new Text();
      location.set(key.toString() + tem);
      context.write(new Text(keyWord), location);
    }
  }
}
/*
 * Init of stop words hash map
 */
public static void initStopWordsMap() {
  for (int i = 0; i < stopWords.length; i++) {
    hmStopWord.put(stopWords[i], null);
  }
}
public void map(WritableComparable<?> key, Text value,
    OutputCollector<Text, CrawlDatum> output, Reporter reporter) throws IOException {
  String url = value.toString(); // value is a line of text
  if (url != null && url.trim().startsWith("#")) {
    // Ignore lines that start with #
    return;
  }

  // If the line contains tabs, the remainder is metadata that can be stored.
  // Entries must be name=value pairs separated by \t.
  float customScore = -1f;
  int customInterval = interval;
  int fixedInterval = -1;
  Map<String, String> metadata = new TreeMap<String, String>();
  if (url.indexOf("\t") != -1) {
    String[] splits = url.split("\t");
    url = splits[0];
    for (int s = 1; s < splits.length; s++) {
      // find the separator between name and value
      int indexEquals = splits[s].indexOf("=");
      if (indexEquals == -1) {
        // skip anything without a =
        continue;
      }
      String metaname = splits[s].substring(0, indexEquals);
      String metavalue = splits[s].substring(indexEquals + 1);
      if (metaname.equals(nutchScoreMDName)) {
        try {
          customScore = Float.parseFloat(metavalue);
        } catch (NumberFormatException nfe) {
          // malformed score: keep the default
        }
      } else if (metaname.equals(nutchFetchIntervalMDName)) {
        try {
          customInterval = Integer.parseInt(metavalue);
        } catch (NumberFormatException nfe) {
          // malformed interval: keep the default
        }
      } else if (metaname.equals(nutchFixedFetchIntervalMDName)) {
        try {
          fixedInterval = Integer.parseInt(metavalue);
        } catch (NumberFormatException nfe) {
          // malformed fixed interval: keep the default
        }
      } else {
        metadata.put(metaname, metavalue);
      }
    }
  }

  try {
    url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_INJECT);
    url = filters.filter(url); // filter the url
  } catch (Exception e) {
    if (LOG.isWarnEnabled()) {
      LOG.warn("Skipping " + url + ":" + e);
    }
    url = null;
  }

  if (url == null) {
    reporter.getCounter("injector", "urls_filtered").increment(1);
  } else {
    // The url passed normalization and filtering: collect it.
    value.set(url);
    CrawlDatum datum = new CrawlDatum();
    datum.setStatus(CrawlDatum.STATUS_INJECTED);

    // If a fixed interval was given, store it as metadata.
    if (fixedInterval > -1) {
      // Set the writable using float; float is what AdaptiveFetchSchedule uses.
      datum.getMetaData().put(Nutch.WRITABLE_FIXED_INTERVAL_KEY, new FloatWritable(fixedInterval));
      datum.setFetchInterval(fixedInterval);
    } else {
      datum.setFetchInterval(customInterval);
    }
    datum.setFetchTime(curTime);

    // now add the remaining metadata
    for (Map.Entry<String, String> entry : metadata.entrySet()) {
      datum.getMetaData().put(new Text(entry.getKey()), new Text(entry.getValue()));
    }
    if (customScore != -1) {
      datum.setScore(customScore);
    } else {
      datum.setScore(scoreInjected);
    }
    try {
      scfilters.injectedScore(value, datum);
    } catch (ScoringFilterException e) {
      if (LOG.isWarnEnabled()) {
        LOG.warn("Cannot filter injected score for url " + url
            + ", using default (" + e.getMessage() + ")");
      }
    }
    reporter.getCounter("injector", "urls_injected").increment(1);
    output.collect(value, datum);
  }
}
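/*
 * For reference, a hedged example of a seed-list line this mapper accepts.
 * The concrete key names below are an assumption (the conventional Nutch
 * defaults); the code actually compares against nutchScoreMDName,
 * nutchFetchIntervalMDName and nutchFixedFetchIntervalMDName, whatever
 * they are configured to. Fields are tab-separated:
 *
 *   http://example.com/ \t nutch.score=2.5 \t nutch.fetchInterval=86400
 */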
public Text getBucketName(BucketStripped bucketStripped) {
  Bucket bucket = bucketCache.get(bucketStripped.getCacheHash());
  return bucket.getBucketName();
}
public Bucket getBucket(BucketStripped bucketStripped) throws IOException {
  // Rebuild a full Bucket from the cached template plus the stripped value.
  Bucket bucket = new Bucket(bucketCache.get(bucketStripped.getCacheHash()));
  bucket.setBucketValue(new Text(bucketStripped.getBucketValue()));
  bucket.computeHash();
  return bucket;
}
public void addBucket(Bucket bucket) {
  int index = BitmapIndex.getBucketCacheIndex(bucket);
  bucketCache.put(new IntWritable(index), bucket);
}
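/*
 * Hedged round-trip sketch (not from the original source): how the cache
 * methods above might compose. Assumes a Hadoop Configuration plus Bucket
 * and BucketStripped instances from the surrounding project.
 */
BucketCache cache = new BucketCache(conf);          // load buckets written by earlier jobs
cache.addBucket(bucket);                            // register a bucket under its index
cache.writeToDisk(conf, true);                      // persist as a MapFile, push to DistributedCache
Bucket restored = cache.getBucket(bucketStripped);  // later: rebuild from a stripped bucket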