Exemplo n.º 1
1
  /**
   * We lost the task tracker! All task-tracker structures have already been updated. Just process
   * the contained tasks and any jobs that might be affected.
   */
  void lostTaskTracker(String trackerName) {
    LOG.info("Lost tracker '" + trackerName + "'");
    TreeSet lostTasks = (TreeSet) trackerToTaskMap.get(trackerName);
    trackerToTaskMap.remove(trackerName);

    if (lostTasks != null) {
      for (Iterator it = lostTasks.iterator(); it.hasNext(); ) {
        String taskId = (String) it.next();
        TaskInProgress tip = (TaskInProgress) taskidToTIPMap.get(taskId);

        // Tell the job to fail the relevant task
        JobInProgress job = tip.getJob();
        job.failedTask(tip, taskId, trackerName);
      }
    }
  }
Exemplo n.º 2
0
    /**
     * The run method lives for the life of the JobTracker, and removes Jobs that are not still
     * running, but which finished a long time ago.
     */
    public void run() {
      while (shouldRun) {
        try {
          Thread.sleep(RETIRE_JOB_CHECK_INTERVAL);
        } catch (InterruptedException ie) {
        }

        synchronized (jobs) {
          synchronized (jobInitQueue) {
            synchronized (jobsByArrival) {
              for (Iterator it = jobs.keySet().iterator(); it.hasNext(); ) {
                String jobid = (String) it.next();
                JobInProgress job = (JobInProgress) jobs.get(jobid);

                if (job.getStatus().getRunState() != JobStatus.RUNNING
                    && job.getStatus().getRunState() != JobStatus.PREP
                    && (job.getFinishTime() + RETIRE_JOB_INTERVAL < System.currentTimeMillis())) {
                  it.remove();

                  jobInitQueue.remove(job);
                  jobsByArrival.remove(job);
                }
              }
            }
          }
        }
      }
    }
Exemplo n.º 3
0
    public void reduce(
        Text key,
        Iterator<CrawlDatum> values,
        OutputCollector<Text, CrawlDatum> output,
        Reporter reporter)
        throws IOException {
      boolean oldSet = false;
      boolean injectedSet = false;
      while (values.hasNext()) {
        CrawlDatum val = values.next();
        if (val.getStatus() == CrawlDatum.STATUS_INJECTED) {
          injected.set(val);
          injected.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
          injectedSet = true;
        } else {
          old.set(val);
          oldSet = true;
        }
      }
      CrawlDatum res = null;

      /**
       * Whether to overwrite, ignore or update existing records
       *
       * @see https://issues.apache.org/jira/browse/NUTCH-1405
       */

      // Injected record already exists and overwrite but not update
      if (injectedSet && oldSet && overwrite) {
        res = injected;

        if (update) {
          LOG.info(key.toString() + " overwritten with injected record but update was specified.");
        }
      }

      // Injected record already exists and update but not overwrite
      if (injectedSet && oldSet && update && !overwrite) {
        res = old;
        old.putAllMetaData(injected);
        old.setScore(injected.getScore() != scoreInjected ? injected.getScore() : old.getScore());
        old.setFetchInterval(
            injected.getFetchInterval() != interval
                ? injected.getFetchInterval()
                : old.getFetchInterval());
      }

      // Old default behaviour
      if (injectedSet && !oldSet) {
        res = injected;
      } else {
        res = old;
      }

      output.collect(key, res);
    }
 /** Return a table of block data */
 public Block[] getBlockReport() {
   TreeSet<Block> blockSet = new TreeSet<Block>();
   volumes.getBlockInfo(blockSet);
   Block blockTable[] = new Block[blockSet.size()];
   int i = 0;
   for (Iterator<Block> it = blockSet.iterator(); it.hasNext(); i++) {
     blockTable[i] = it.next();
   }
   return blockTable;
 }
Exemplo n.º 5
0
 public synchronized TaskReport[] getReduceTaskReports(String jobid) {
   JobInProgress job = (JobInProgress) jobs.get(jobid);
   if (job == null) {
     return new TaskReport[0];
   } else {
     Vector reports = new Vector();
     Vector completeReduceTasks = job.reportTasksInProgress(false, true);
     for (Iterator it = completeReduceTasks.iterator(); it.hasNext(); ) {
       TaskInProgress tip = (TaskInProgress) it.next();
       reports.add(tip.generateSingleReport());
     }
     Vector incompleteReduceTasks = job.reportTasksInProgress(false, false);
     for (Iterator it = incompleteReduceTasks.iterator(); it.hasNext(); ) {
       TaskInProgress tip = (TaskInProgress) it.next();
       reports.add(tip.generateSingleReport());
     }
     return (TaskReport[]) reports.toArray(new TaskReport[reports.size()]);
   }
 }
Exemplo n.º 6
0
 public Vector completedJobs() {
   Vector v = new Vector();
   for (Iterator it = jobs.values().iterator(); it.hasNext(); ) {
     JobInProgress jip = (JobInProgress) it.next();
     JobStatus status = jip.getStatus();
     if (status.getRunState() == JobStatus.SUCCEEDED) {
       v.add(jip);
     }
   }
   return v;
 }
Exemplo n.º 7
0
 public Vector runningJobs() {
   Vector v = new Vector();
   for (Iterator it = jobs.values().iterator(); it.hasNext(); ) {
     JobInProgress jip = (JobInProgress) it.next();
     JobStatus status = jip.getStatus();
     if (status.getRunState() == JobStatus.RUNNING) {
       v.add(jip);
     }
   }
   return v;
 }
Exemplo n.º 8
0
 public void reduce(
     Text key,
     Iterator<IntWritable> values,
     OutputCollector<Text, IntWritable> output,
     Reporter reporter)
     throws IOException {
   int sum = 0;
   while (values.hasNext()) {
     sum += values.next().get();
   }
   output.collect(key, new IntWritable(sum));
 }
Exemplo n.º 9
0
  /** Start the JobTracker process, listen on the indicated port */
  JobTracker(Configuration conf) throws IOException {
    //
    // Grab some static constants
    //
    maxCurrentTasks = conf.getInt("mapred.tasktracker.tasks.maximum", 2);
    RETIRE_JOB_INTERVAL = conf.getLong("mapred.jobtracker.retirejob.interval", 24 * 60 * 60 * 1000);
    RETIRE_JOB_CHECK_INTERVAL = conf.getLong("mapred.jobtracker.retirejob.check", 60 * 1000);
    TASK_ALLOC_EPSILON = conf.getFloat("mapred.jobtracker.taskalloc.loadbalance.epsilon", 0.2f);
    PAD_FRACTION = conf.getFloat("mapred.jobtracker.taskalloc.capacitypad", 0.1f);
    MIN_SLOTS_FOR_PADDING = 3 * maxCurrentTasks;

    // This is a directory of temporary submission files.  We delete it
    // on startup, and can delete any files that we're done with
    this.conf = conf;
    JobConf jobConf = new JobConf(conf);
    this.systemDir = jobConf.getSystemDir();
    this.fs = FileSystem.get(conf);
    FileUtil.fullyDelete(fs, systemDir);
    fs.mkdirs(systemDir);

    // Same with 'localDir' except it's always on the local disk.
    jobConf.deleteLocalFiles(SUBDIR);

    // Set ports, start RPC servers, etc.
    InetSocketAddress addr = getAddress(conf);
    this.localMachine = addr.getHostName();
    this.port = addr.getPort();
    this.interTrackerServer = RPC.getServer(this, addr.getPort(), 10, false, conf);
    this.interTrackerServer.start();
    Properties p = System.getProperties();
    for (Iterator it = p.keySet().iterator(); it.hasNext(); ) {
      String key = (String) it.next();
      String val = (String) p.getProperty(key);
      LOG.info("Property '" + key + "' is " + val);
    }

    this.infoPort = conf.getInt("mapred.job.tracker.info.port", 50030);
    this.infoServer = new JobTrackerInfoServer(this, infoPort);
    this.infoServer.start();

    this.startTime = System.currentTimeMillis();

    new Thread(this.expireTrackers).start();
    new Thread(this.retireJobs).start();
    new Thread(this.initJobs).start();
  }
Exemplo n.º 10
0
 /**
  * A tracker wants to know if any of its Tasks have been closed (because the job completed,
  * whether successfully or not)
  */
 public synchronized String pollForTaskWithClosedJob(String taskTracker) {
   TreeSet taskIds = (TreeSet) trackerToTaskMap.get(taskTracker);
   if (taskIds != null) {
     for (Iterator it = taskIds.iterator(); it.hasNext(); ) {
       String taskId = (String) it.next();
       TaskInProgress tip = (TaskInProgress) taskidToTIPMap.get(taskId);
       if (tip.shouldCloseForClosedJob(taskId)) {
         //
         // This is how the JobTracker ends a task at the TaskTracker.
         // It may be successfully completed, or may be killed in
         // mid-execution.
         //
         return taskId;
       }
     }
   }
   return null;
 }
Exemplo n.º 11
0
  /**
   * Accept and process a new TaskTracker profile. We might have known about the TaskTracker
   * previously, or it might be brand-new. All task-tracker structures have already been updated.
   * Just process the contained tasks and any jobs that might be affected.
   */
  void updateTaskStatuses(TaskTrackerStatus status) {
    for (Iterator it = status.taskReports(); it.hasNext(); ) {
      TaskStatus report = (TaskStatus) it.next();
      TaskInProgress tip = (TaskInProgress) taskidToTIPMap.get(report.getTaskId());
      if (tip == null) {
        LOG.info(
            "Serious problem.  While updating status, cannot find taskid " + report.getTaskId());
      } else {
        JobInProgress job = tip.getJob();
        job.updateTaskStatus(tip, report);

        if (report.getRunState() == TaskStatus.SUCCEEDED) {
          job.completedTask(tip, report.getTaskId());
        } else if (report.getRunState() == TaskStatus.FAILED) {
          // Tell the job to fail the relevant task
          job.failedTask(tip, report.getTaskId(), status.getTrackerName());
        }
      }
    }
  }
Exemplo n.º 12
0
 private static void deleteCache(Configuration conf, MRAsyncDiskService asyncDiskService)
     throws IOException {
   List<CacheStatus> deleteSet = new LinkedList<CacheStatus>();
   // try deleting cache Status with refcount of zero
   synchronized (cachedArchives) {
     for (Iterator<String> it = cachedArchives.keySet().iterator(); it.hasNext(); ) {
       String cacheId = (String) it.next();
       CacheStatus lcacheStatus = cachedArchives.get(cacheId);
       if (lcacheStatus.refcount == 0) {
         // delete this cache entry from the global list
         // and mark the localized file for deletion
         deleteSet.add(lcacheStatus);
         it.remove();
       }
     }
   }
   // do the deletion asynchronously, after releasing the global lock
   Thread cacheFileCleaner =
       new Thread(new CacheFileCleanTask(asyncDiskService, FileSystem.getLocal(conf), deleteSet));
   cacheFileCleaner.start();
 }
Exemplo n.º 13
0
    public void reduce(
        IntWritable key,
        Iterator<Text> values,
        OutputCollector<IntWritable, Text> output,
        Reporter reporter)
        throws IOException {
      HashMap<String, Integer> countries_map = new HashMap<String, Integer>();
      ArrayList<Integer> counties = new ArrayList<>();
      String cp = new String();

      while (values.hasNext()) {
        cp = values.next().toString();
        if (countries_map.containsKey(cp)) {
          countries_map.put(cp, countries_map.get(cp) + 1);
        } else {
          countries_map.put(cp, 1);
        }
      }

      for (java.util.Map.Entry<String, Integer> entry : countries_map.entrySet()) {
        counties.add(entry.getValue());
      }
      output.collect(
          key,
          new Text(
              ""
                  + countries_map.entrySet().size()
                  + " "
                  + Collections.min(counties)
                  + " "
                  + median(counties)
                  + " "
                  + Collections.max(counties)
                  + " "
                  + mean(counties)
                  + " "
                  + standard_deviation(counties)));
    }
Exemplo n.º 14
0
  /**
   * Try to update an old block to a new block. If there are ongoing create threads running for the
   * old block, the threads will be returned without updating the block.
   *
   * @return ongoing create threads if there is any. Otherwise, return null.
   */
  private synchronized List<Thread> tryUpdateBlock(Block oldblock, Block newblock)
      throws IOException {
    // check ongoing create threads
    final ActiveFile activefile = ongoingCreates.get(oldblock);
    if (activefile != null && !activefile.threads.isEmpty()) {
      // remove dead threads
      for (Iterator<Thread> i = activefile.threads.iterator(); i.hasNext(); ) {
        final Thread t = i.next();
        if (!t.isAlive()) {
          i.remove();
        }
      }

      // return living threads
      if (!activefile.threads.isEmpty()) {
        return new ArrayList<Thread>(activefile.threads);
      }
    }

    // No ongoing create threads is alive. Update block.
    File blockFile = findBlockFile(oldblock.getBlockId());
    if (blockFile == null) {
      throw new IOException("Block " + oldblock + " does not exist.");
    }

    File oldMetaFile = findMetaFile(blockFile);
    long oldgs = parseGenerationStamp(blockFile, oldMetaFile);

    // rename meta file to a tmp file
    File tmpMetaFile =
        new File(
            oldMetaFile.getParent(),
            oldMetaFile.getName() + "_tmp" + newblock.getGenerationStamp());
    if (!oldMetaFile.renameTo(tmpMetaFile)) {
      throw new IOException("Cannot rename block meta file to " + tmpMetaFile);
    }

    // update generation stamp
    if (oldgs > newblock.getGenerationStamp()) {
      throw new IOException(
          "Cannot update block (id="
              + newblock.getBlockId()
              + ") generation stamp from "
              + oldgs
              + " to "
              + newblock.getGenerationStamp());
    }

    // update length
    if (newblock.getNumBytes() > oldblock.getNumBytes()) {
      throw new IOException(
          "Cannot update block file (="
              + blockFile
              + ") length from "
              + oldblock.getNumBytes()
              + " to "
              + newblock.getNumBytes());
    }
    if (newblock.getNumBytes() < oldblock.getNumBytes()) {
      truncateBlock(blockFile, tmpMetaFile, oldblock.getNumBytes(), newblock.getNumBytes());
    }

    // rename the tmp file to the new meta file (with new generation stamp)
    File newMetaFile = getMetaFile(blockFile, newblock);
    if (!tmpMetaFile.renameTo(newMetaFile)) {
      throw new IOException("Cannot rename tmp meta file to " + newMetaFile);
    }

    updateBlockMap(ongoingCreates, oldblock, newblock);
    updateBlockMap(volumeMap, oldblock, newblock);

    // paranoia! verify that the contents of the stored block
    // matches the block file on disk.
    validateBlockMetadata(newblock);
    return null;
  }
Exemplo n.º 15
0
  /**
   * A tracker wants to know if there's a Task to run. Returns a task we'd like the TaskTracker to
   * execute right now.
   *
   * <p>Eventually this function should compute load on the various TaskTrackers, and incorporate
   * knowledge of DFS file placement. But for right now, it just grabs a single item out of the
   * pending task list and hands it back.
   */
  public synchronized Task pollForNewTask(String taskTracker) {
    //
    // Compute average map and reduce task numbers across pool
    //
    int avgMaps = 0;
    int avgReduces = 0;
    int numTaskTrackers;
    TaskTrackerStatus tts;
    synchronized (taskTrackers) {
      numTaskTrackers = taskTrackers.size();
      tts = (TaskTrackerStatus) taskTrackers.get(taskTracker);
    }
    if (numTaskTrackers > 0) {
      avgMaps = totalMaps / numTaskTrackers;
      avgReduces = totalReduces / numTaskTrackers;
    }
    int totalCapacity = numTaskTrackers * maxCurrentTasks;
    //
    // Get map + reduce counts for the current tracker.
    //
    if (tts == null) {
      LOG.warning("Unknown task tracker polling; ignoring: " + taskTracker);
      return null;
    }

    int numMaps = tts.countMapTasks();
    int numReduces = tts.countReduceTasks();

    //
    // In the below steps, we allocate first a map task (if appropriate),
    // and then a reduce task if appropriate.  We go through all jobs
    // in order of job arrival; jobs only get serviced if their
    // predecessors are serviced, too.
    //

    //
    // We hand a task to the current taskTracker if the given machine
    // has a workload that's equal to or less than the averageMaps
    // +/- TASK_ALLOC_EPSILON.  (That epsilon is in place in case
    // there is an odd machine that is failing for some reason but
    // has not yet been removed from the pool, making capacity seem
    // larger than it really is.)
    //
    synchronized (jobsByArrival) {
      if ((numMaps < maxCurrentTasks) && (numMaps <= (avgMaps + TASK_ALLOC_EPSILON))) {

        int totalNeededMaps = 0;
        for (Iterator it = jobsByArrival.iterator(); it.hasNext(); ) {
          JobInProgress job = (JobInProgress) it.next();
          if (job.getStatus().getRunState() != JobStatus.RUNNING) {
            continue;
          }

          Task t = job.obtainNewMapTask(taskTracker, tts);
          if (t != null) {
            return t;
          }

          //
          // Beyond the highest-priority task, reserve a little
          // room for failures and speculative executions; don't
          // schedule tasks to the hilt.
          //
          totalNeededMaps += job.desiredMaps();
          double padding = 0;
          if (totalCapacity > MIN_SLOTS_FOR_PADDING) {
            padding = Math.min(maxCurrentTasks, totalNeededMaps * PAD_FRACTION);
          }
          if (totalNeededMaps + padding >= totalCapacity) {
            break;
          }
        }
      }

      //
      // Same thing, but for reduce tasks
      //
      if ((numReduces < maxCurrentTasks) && (numReduces <= (avgReduces + TASK_ALLOC_EPSILON))) {

        int totalNeededReduces = 0;
        for (Iterator it = jobsByArrival.iterator(); it.hasNext(); ) {
          JobInProgress job = (JobInProgress) it.next();
          if (job.getStatus().getRunState() != JobStatus.RUNNING) {
            continue;
          }

          Task t = job.obtainNewReduceTask(taskTracker, tts);
          if (t != null) {
            return t;
          }

          //
          // Beyond the highest-priority task, reserve a little
          // room for failures and speculative executions; don't
          // schedule tasks to the hilt.
          //
          totalNeededReduces += job.desiredReduces();
          double padding = 0;
          if (totalCapacity > MIN_SLOTS_FOR_PADDING) {
            padding = Math.min(maxCurrentTasks, totalNeededReduces * PAD_FRACTION);
          }
          if (totalNeededReduces + padding >= totalCapacity) {
            break;
          }
        }
      }
    }
    return null;
  }
Exemplo n.º 16
0
    public void map(
        WritableComparable<?> key,
        Text value,
        OutputCollector<Text, CrawlDatum> output,
        Reporter reporter)
        throws IOException {
      String url = value.toString(); // value is line of text

      if (url != null && url.trim().startsWith("#")) {
        /* Ignore line that start with # */
        return;
      }

      // if tabs : metadata that could be stored
      // must be name=value and separated by \t
      float customScore = -1f;
      int customInterval = interval;
      int fixedInterval = -1;
      Map<String, String> metadata = new TreeMap<String, String>();
      if (url.indexOf("\t") != -1) {
        String[] splits = url.split("\t");
        url = splits[0];
        for (int s = 1; s < splits.length; s++) {
          // find separation between name and value
          int indexEquals = splits[s].indexOf("=");
          if (indexEquals == -1) {
            // skip anything without a =
            continue;
          }
          String metaname = splits[s].substring(0, indexEquals);
          String metavalue = splits[s].substring(indexEquals + 1);
          if (metaname.equals(nutchScoreMDName)) {
            try {
              customScore = Float.parseFloat(metavalue);
            } catch (NumberFormatException nfe) {
            }
          } else if (metaname.equals(nutchFetchIntervalMDName)) {
            try {
              customInterval = Integer.parseInt(metavalue);
            } catch (NumberFormatException nfe) {
            }
          } else if (metaname.equals(nutchFixedFetchIntervalMDName)) {
            try {
              fixedInterval = Integer.parseInt(metavalue);
            } catch (NumberFormatException nfe) {
            }
          } else metadata.put(metaname, metavalue);
        }
      }
      try {
        url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_INJECT);
        url = filters.filter(url); // filter the url
      } catch (Exception e) {
        if (LOG.isWarnEnabled()) {
          LOG.warn("Skipping " + url + ":" + e);
        }
        url = null;
      }
      if (url == null) {
        reporter.getCounter("injector", "urls_filtered").increment(1);
      } else { // if it passes
        value.set(url); // collect it
        CrawlDatum datum = new CrawlDatum();
        datum.setStatus(CrawlDatum.STATUS_INJECTED);

        // Is interval custom? Then set as meta data
        if (fixedInterval > -1) {
          // Set writable using float. Flaot is used by AdaptiveFetchSchedule
          datum
              .getMetaData()
              .put(Nutch.WRITABLE_FIXED_INTERVAL_KEY, new FloatWritable(fixedInterval));
          datum.setFetchInterval(fixedInterval);
        } else {
          datum.setFetchInterval(customInterval);
        }

        datum.setFetchTime(curTime);
        // now add the metadata
        Iterator<String> keysIter = metadata.keySet().iterator();
        while (keysIter.hasNext()) {
          String keymd = keysIter.next();
          String valuemd = metadata.get(keymd);
          datum.getMetaData().put(new Text(keymd), new Text(valuemd));
        }
        if (customScore != -1) datum.setScore(customScore);
        else datum.setScore(scoreInjected);
        try {
          scfilters.injectedScore(value, datum);
        } catch (ScoringFilterException e) {
          if (LOG.isWarnEnabled()) {
            LOG.warn(
                "Cannot filter injected score for url "
                    + url
                    + ", using default ("
                    + e.getMessage()
                    + ")");
          }
        }
        reporter.getCounter("injector", "urls_injected").increment(1);
        output.collect(value, datum);
      }
    }