/** For debugging. */ public static void main(String[] args) throws Exception { final String usage = "NutchBean query"; if (args.length == 0) { System.err.println(usage); System.exit(-1); } final Configuration conf = NutchConfiguration.create(); final NutchBean bean = new NutchBean(conf); try { final Query query = Query.parse(args[0], conf); final Hits hits = bean.search(query, 10); System.out.println("Total hits: " + hits.getTotal()); final int length = (int) Math.min(hits.getTotal(), 10); final Hit[] show = hits.getHits(0, length); final HitDetails[] details = bean.getDetails(show); final Summary[] summaries = bean.getSummary(details, query); for (int i = 0; i < hits.getLength(); i++) { System.out.println(" " + i + " " + details[i] + "\n" + summaries[i]); } } catch (Throwable t) { LOG.error("Exception occured while executing search: " + t, t); System.exit(1); } System.exit(0); }
private static void addFolder(FileSystem fs, Path p, JsonArray succeeded, JsonArray failed) { try { if (fs == null) return; for (FileStatus file : fs.listStatus(p)) { Path pfs = file.getPath(); if (file.isDir()) { addFolder(fs, pfs, succeeded, failed); } else { Key k = Key.make(pfs.toString()); long size = file.getLen(); Value val = null; if (pfs.getName().endsWith(Extensions.JSON)) { JsonParser parser = new JsonParser(); JsonObject json = parser.parse(new InputStreamReader(fs.open(pfs))).getAsJsonObject(); JsonElement v = json.get(Constants.VERSION); if (v == null) throw new RuntimeException("Missing version"); JsonElement type = json.get(Constants.TYPE); if (type == null) throw new RuntimeException("Missing type"); Class c = Class.forName(type.getAsString()); OldModel model = (OldModel) c.newInstance(); model.fromJson(json); } else if (pfs.getName().endsWith(Extensions.HEX)) { // Hex file? FSDataInputStream s = fs.open(pfs); int sz = (int) Math.min(1L << 20, size); // Read up to the 1st meg byte[] mem = MemoryManager.malloc1(sz); s.readFully(mem); // Convert to a ValueArray (hope it fits in 1Meg!) ValueArray ary = new ValueArray(k, 0).read(new AutoBuffer(mem)); val = new Value(k, ary, Value.HDFS); } else if (size >= 2 * ValueArray.CHUNK_SZ) { val = new Value( k, new ValueArray(k, size), Value.HDFS); // ValueArray byte wrapper over a large file } else { val = new Value(k, (int) size, Value.HDFS); // Plain Value val.setdsk(); } DKV.put(k, val); Log.info("PersistHdfs: DKV.put(" + k + ")"); JsonObject o = new JsonObject(); o.addProperty(Constants.KEY, k.toString()); o.addProperty(Constants.FILE, pfs.toString()); o.addProperty(Constants.VALUE_SIZE, file.getLen()); succeeded.add(o); } } } catch (Exception e) { Log.err(e); JsonObject o = new JsonObject(); o.addProperty(Constants.FILE, p.toString()); o.addProperty(Constants.ERROR, e.getMessage()); failed.add(o); } }
/** * Calculate how many maps to run. Number of maps is bounded by a minimum of the cumulative size * of the copy / (distcp.bytes.per.map, default BYTES_PER_MAP or -m on the command line) and at * most (distcp.max.map.tasks, default MAX_MAPS_PER_NODE * nodes in the cluster). * * @param totalBytes Count of total bytes for job * @param job The job to configure * @return Count of maps to run. */ private static void setMapCount(long totalBytes, JobConf job) throws IOException { int numMaps = (int) (totalBytes / job.getLong(BYTES_PER_MAP_LABEL, BYTES_PER_MAP)); numMaps = Math.min( numMaps, job.getInt( MAX_MAPS_LABEL, MAX_MAPS_PER_NODE * new JobClient(job).getClusterStatus().getTaskTrackers())); job.setNumMapTasks(Math.max(numMaps, 1)); }
/** * Generate the list of files and make them into FileSplits. This needs to be copied to insert a * filter on acceptable data */ @Override public List<InputSplit> getSplits(JobContext job) throws IOException { long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job)); long maxSize = getMaxSplitSize(job); long desiredMappers = job.getConfiguration().getLong("org.systemsbiology.jxtandem.DesiredXMLInputMappers", 0); // generate splits List<InputSplit> splits = new ArrayList<InputSplit>(); List<FileStatus> fileStatuses = listStatus(job); boolean forceNumberMappers = fileStatuses.size() == 1; for (FileStatus file : fileStatuses) { Path path = file.getPath(); if (!isPathAcceptable(path)) // filter acceptable data continue; FileSystem fs = path.getFileSystem(job.getConfiguration()); long length = file.getLen(); BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length); if ((length != 0) && isSplitable(job, path)) { long blockSize = file.getBlockSize(); // use desired mappers to force more splits if (forceNumberMappers && desiredMappers > 0) maxSize = Math.min(maxSize, (length / desiredMappers)); long splitSize = computeSplitSize(blockSize, minSize, maxSize); long bytesRemaining = length; while (withinSlop(splitSize, bytesRemaining)) { int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining); splits.add( new FileSplit( path, length - bytesRemaining, splitSize, blkLocations[blkIndex].getHosts())); bytesRemaining -= splitSize; } if (bytesRemaining != 0) { splits.add( new FileSplit( path, length - bytesRemaining, bytesRemaining, blkLocations[blkLocations.length - 1].getHosts())); } } else if (length != 0) { splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts())); } else { // Create empty hosts array for zero length files splits.add(new FileSplit(path, 0, length, new String[0])); } } System.out.println("Total # of splits: " + splits.size()); // LOG.debug("Total # of splits: " + splits.size()); return splits; }
/** * A tracker wants to know if there's a Task to run. Returns a task we'd like the TaskTracker to * execute right now. * * <p>Eventually this function should compute load on the various TaskTrackers, and incorporate * knowledge of DFS file placement. But for right now, it just grabs a single item out of the * pending task list and hands it back. */ public synchronized Task pollForNewTask(String taskTracker) { // // Compute average map and reduce task numbers across pool // int avgMaps = 0; int avgReduces = 0; int numTaskTrackers; TaskTrackerStatus tts; synchronized (taskTrackers) { numTaskTrackers = taskTrackers.size(); tts = (TaskTrackerStatus) taskTrackers.get(taskTracker); } if (numTaskTrackers > 0) { avgMaps = totalMaps / numTaskTrackers; avgReduces = totalReduces / numTaskTrackers; } int totalCapacity = numTaskTrackers * maxCurrentTasks; // // Get map + reduce counts for the current tracker. // if (tts == null) { LOG.warning("Unknown task tracker polling; ignoring: " + taskTracker); return null; } int numMaps = tts.countMapTasks(); int numReduces = tts.countReduceTasks(); // // In the below steps, we allocate first a map task (if appropriate), // and then a reduce task if appropriate. We go through all jobs // in order of job arrival; jobs only get serviced if their // predecessors are serviced, too. // // // We hand a task to the current taskTracker if the given machine // has a workload that's equal to or less than the averageMaps // +/- TASK_ALLOC_EPSILON. (That epsilon is in place in case // there is an odd machine that is failing for some reason but // has not yet been removed from the pool, making capacity seem // larger than it really is.) // synchronized (jobsByArrival) { if ((numMaps < maxCurrentTasks) && (numMaps <= (avgMaps + TASK_ALLOC_EPSILON))) { int totalNeededMaps = 0; for (Iterator it = jobsByArrival.iterator(); it.hasNext(); ) { JobInProgress job = (JobInProgress) it.next(); if (job.getStatus().getRunState() != JobStatus.RUNNING) { continue; } Task t = job.obtainNewMapTask(taskTracker, tts); if (t != null) { return t; } // // Beyond the highest-priority task, reserve a little // room for failures and speculative executions; don't // schedule tasks to the hilt. // totalNeededMaps += job.desiredMaps(); double padding = 0; if (totalCapacity > MIN_SLOTS_FOR_PADDING) { padding = Math.min(maxCurrentTasks, totalNeededMaps * PAD_FRACTION); } if (totalNeededMaps + padding >= totalCapacity) { break; } } } // // Same thing, but for reduce tasks // if ((numReduces < maxCurrentTasks) && (numReduces <= (avgReduces + TASK_ALLOC_EPSILON))) { int totalNeededReduces = 0; for (Iterator it = jobsByArrival.iterator(); it.hasNext(); ) { JobInProgress job = (JobInProgress) it.next(); if (job.getStatus().getRunState() != JobStatus.RUNNING) { continue; } Task t = job.obtainNewReduceTask(taskTracker, tts); if (t != null) { return t; } // // Beyond the highest-priority task, reserve a little // room for failures and speculative executions; don't // schedule tasks to the hilt. // totalNeededReduces += job.desiredReduces(); double padding = 0; if (totalCapacity > MIN_SLOTS_FOR_PADDING) { padding = Math.min(maxCurrentTasks, totalNeededReduces * PAD_FRACTION); } if (totalNeededReduces + padding >= totalCapacity) { break; } } } } return null; }