Beispiel #1
5
  /**
   * Handles one heartbeat message sent by a task tracker.
   *
   * <p>The status is stamped with the current time, handed to {@code updateTaskTrackerStatus}
   * while holding the locks on {@code taskTrackers} and {@code trackerExpiryQueue}, and finally
   * passed to {@code updateTaskStatuses}.
   *
   * @param trackerStatus status reported by the tracker; its last-seen time is overwritten here
   * @param initialContact {@code true} when this is the tracker's first contact
   * @return {@code InterTrackerProtocol.UNKNOWN_TASKTRACKER} when a follow-up heartbeat comes from
   *     a tracker with no earlier record, otherwise {@code InterTrackerProtocol.TRACKERS_OK}
   */
  public synchronized int emitHeartbeat(TaskTrackerStatus trackerStatus, boolean initialContact) {
    String trackerName = trackerStatus.getTrackerName();
    trackerStatus.setLastSeen(System.currentTimeMillis());

    synchronized (taskTrackers) {
      synchronized (trackerExpiryQueue) {
        // Presumably true when a record for this tracker already existed -- confirm against
        // updateTaskTrackerStatus.
        boolean knownTracker = updateTaskTrackerStatus(trackerName, trackerStatus);

        if (!initialContact && !knownTracker) {
          // A follow-up heartbeat must come from a tracker there is already a record of.
          // NOTE(review): updateTaskTrackerStatus has already run at this point; confirm it
          // leaves nothing behind for the unknown tracker.
          return InterTrackerProtocol.UNKNOWN_TASKTRACKER;
        }

        if (initialContact) {
          if (knownTracker) {
            // First contact under a name seen before: clear out any state hanging around.
            lostTaskTracker(trackerName);
          }
          trackerExpiryQueue.add(trackerStatus);
        }
      }
    }

    updateTaskStatuses(trackerStatus);
    return InterTrackerProtocol.TRACKERS_OK;
  }
Beispiel #2
2
  /**
   * Removes a task id from the three lookup tables: task id --> tracker, tracker --> task ids,
   * and task id --> TIP.
   *
   * <p>A task id with no recorded tracker is tolerated: the tracker-side lookup is skipped and
   * the remaining tables are still cleaned up.
   *
   * @param taskid id of the task whose entries should be dropped
   */
  void removeTaskEntry(String taskid) {
    // taskid --> tracker
    String tracker = (String) taskidToTrackerMap.remove(taskid);

    // tracker --> taskid
    // No tracker was recorded for this id (unknown or already removed). Looking up a null key
    // would throw NullPointerException if trackerToTaskMap is a naturally ordered sorted map
    // (its declaration is not visible here), so skip the reverse lookup entirely.
    if (tracker != null) {
      TreeSet trackerSet = (TreeSet) trackerToTaskMap.get(tracker);
      if (trackerSet != null) {
        trackerSet.remove(taskid);
      }
    }

    // taskid --> TIP
    taskidToTIPMap.remove(taskid);
  }
  /**
   * Crawls outward from {@code m_baseUrl} one depth level at a time, up to {@code m_crawlDepth}
   * levels, reporting the URLs found on every fetched page and ending with a summary.
   *
   * <p>Each level fetches the URLs queued by the level before it; the newly included URLs
   * returned by {@code outputUrlResults} become the queue for the next level. The method returns
   * at once, without printing the summary, whenever {@code isInterrupted()} is true.
   */
  private void checkRules() {
    outputMessage("\nChecking " + m_baseUrl, TEST_SUMMARY_MESSAGE);
    outputMessage(
        "crawl depth: " + m_crawlDepth + "     crawl delay: " + m_crawlDelay + " ms.",
        PLAIN_MESSAGE);

    TreeSet pending = new TreeSet();
    TreeSet visited = new TreeSet();
    // initialize the queue with the base URL
    pending.add(m_baseUrl);

    // per-depth counters, indexed by (depth - 1)
    depth_incl = new int[m_crawlDepth];
    depth_fetched = new int[m_crawlDepth];
    depth_parsed = new int[m_crawlDepth];

    long startedAt = TimeBase.nowMs();
    int level = 1;
    while (level <= m_crawlDepth) {
      if (isInterrupted()) {
        return;
      }
      m_curDepth = level;
      if (pending.isEmpty()) {
        outputMessage("\nNothing left to crawl, exiting after depth " + (level - 1), PLAIN_MESSAGE);
        break;
      }

      // Snapshot this level's URLs so the queue can be refilled while they are processed.
      String[] batch = (String[]) pending.toArray(new String[0]);
      pending.clear();
      outputMessage("\nDepth " + level, PLAIN_MESSAGE);

      int next = 0;
      while (next < batch.length) {
        if (isInterrupted()) {
          return;
        }
        pauseBeforeFetch();
        String pageUrl = batch[next];

        // start every page with empty include/exclude sets
        m_incls.clear();
        m_excls.clear();

        // crawl the page
        buildUrlSets(pageUrl);
        visited.add(pageUrl);

        // report the include/exclude results; the new includes feed the next depth level
        pending.addAll(outputUrlResults(pageUrl, m_incls, m_excls));
        next++;
      }
      level++;
    }

    long elapsed = TimeBase.nowMs() - startedAt;
    outputSummary(m_baseUrl, visited, pending, elapsed);
  }
Beispiel #4
1
  /**
   * Handles the loss of a task tracker. All task-tracker structures have already been updated by
   * the time this runs; what is left is to process the tasks the tracker held so that the jobs
   * they belong to can react.
   *
   * @param trackerName name of the tracker that was lost
   */
  void lostTaskTracker(String trackerName) {
    LOG.info("Lost tracker '" + trackerName + "'");

    // Take the tracker's task set out of the table; nothing to do if it had none.
    TreeSet orphaned = (TreeSet) trackerToTaskMap.remove(trackerName);
    if (orphaned == null) {
      return;
    }

    Iterator ids = orphaned.iterator();
    while (ids.hasNext()) {
      String taskId = (String) ids.next();
      TaskInProgress tip = (TaskInProgress) taskidToTIPMap.get(taskId);

      // Tell the owning job to fail the relevant task
      tip.getJob().failedTask(tip, taskId, trackerName);
    }
  }
Beispiel #5
0
 /**
  * Lets a tracker ask whether any of its tasks has been closed because the job completed,
  * whether successfully or not.
  *
  * @param taskTracker name of the asking tracker
  * @return the first task id recorded for that tracker, in the set's iteration order, for which
  *     {@code shouldCloseForClosedJob} is true; {@code null} when the tracker has no recorded
  *     tasks or none of them qualifies
  */
 public synchronized String pollForTaskWithClosedJob(String taskTracker) {
   TreeSet assigned = (TreeSet) trackerToTaskMap.get(taskTracker);
   if (assigned == null) {
     return null;
   }

   Iterator ids = assigned.iterator();
   while (ids.hasNext()) {
     String candidate = (String) ids.next();
     TaskInProgress owner = (TaskInProgress) taskidToTIPMap.get(candidate);
     // Handing the id back is how the JobTracker ends a task at the TaskTracker. The task may
     // have completed successfully, or may be killed in mid-execution.
     if (owner.shouldCloseForClosedJob(candidate)) {
       return candidate;
     }
   }
   return null;
 }
Beispiel #6
0
  ///////////////////////////////////////////////////////
  // Maintain lookup tables; called by JobInProgress
  // and TaskInProgress
  ///////////////////////////////////////////////////////

  /**
   * Records a task in the three lookup tables: task id --> tracker, tracker --> task ids, and
   * task id --> TIP. A task-id set is created for the tracker on first use.
   *
   * @param taskid id of the task being registered
   * @param taskTracker name of the tracker the task is associated with
   * @param tip the task-in-progress the task id maps to
   */
  void createTaskEntry(String taskid, String taskTracker, TaskInProgress tip) {
    String message =
        "Adding task '" + taskid + "' to tip " + tip.getTIPId() + ", for tracker '" + taskTracker
            + "'";
    LOG.info(message);

    // taskid --> tracker
    taskidToTrackerMap.put(taskid, taskTracker);

    // tracker --> taskid (create the tracker's set the first time it is needed)
    Object existing = trackerToTaskMap.get(taskTracker);
    TreeSet tasksOnTracker;
    if (existing != null) {
      tasksOnTracker = (TreeSet) existing;
    } else {
      tasksOnTracker = new TreeSet();
      trackerToTaskMap.put(taskTracker, tasksOnTracker);
    }
    tasksOnTracker.add(taskid);

    // taskid --> TIP
    taskidToTIPMap.put(taskid, tip);
  }
  /**
   * Prints the included and excluded URLs found for one page and marks them as reported.
   *
   * <p>Only URLs not already in {@code m_reported} are listed individually; the header lines give
   * the new/old split. The count of new included URLs is added to {@code depth_incl} for the
   * current depth, and the output writer, if any, is flushed at the end.
   *
   * @param url the page the sets were built from (not used by this method)
   * @param m_inclset URLs found on the page that were included
   * @param m_exclset URLs found on the page that were excluded
   * @return the sorted set of included URLs that had not been reported before
   */
  private Set outputUrlResults(String url, Set m_inclset, Set m_exclset) {
    // Anything already reported is "old"; what is left over is new for this page.
    Set freshIncludes = new TreeSet(CollectionUtils.subtract(m_inclset, m_reported));
    Set freshExcludes = new TreeSet(CollectionUtils.subtract(m_exclset, m_reported));

    if (!m_inclset.isEmpty()) {
      int newCount = freshIncludes.size();
      int oldCount = m_inclset.size() - newCount;
      outputMessage(
          "\nIncluded Urls: (" + newCount + " new, " + oldCount + " old)", URL_SUMMARY_MESSAGE);
      depth_incl[m_curDepth - 1] += newCount;
    }
    Iterator includeIter = freshIncludes.iterator();
    while (includeIter.hasNext()) {
      outputMessage(includeIter.next().toString(), PLAIN_MESSAGE);
    }

    if (!m_exclset.isEmpty()) {
      int newCount = freshExcludes.size();
      int oldCount = m_exclset.size() - newCount;
      outputMessage(
          "\nExcluded Urls: (" + newCount + " new, " + oldCount + " old)", URL_SUMMARY_MESSAGE);
    }
    Iterator excludeIter = freshExcludes.iterator();
    while (excludeIter.hasNext()) {
      outputMessage(excludeIter.next().toString(), PLAIN_MESSAGE);
    }

    // Remember everything listed above so later pages count it as old.
    m_reported.addAll(freshIncludes);
    m_reported.addAll(freshExcludes);

    if (m_outWriter != null) {
      try {
        m_outWriter.flush();
      } catch (IOException ignored) {
        // flushing is best effort; a failure here is deliberately not reported
      }
    }
    return freshIncludes;
  }
  /**
   * Prints the end-of-crawl summary: totals, a per-depth table of the fetched / parsed / new-URL
   * counters, the number of URLs left unfetched, the elapsed time and the fetch rate.
   *
   * @param baseUrl the URL the crawl started from
   * @param fetched URLs that were fetched; only its size is reported
   * @param toCrawl URLs still queued when the crawl stopped; only its size is reported
   * @param elapsedTime duration of the crawl, in the units {@code Constants.SECOND} is expressed
   *     in (presumably milliseconds -- confirm against {@code Constants})
   */
  private void outputSummary(String baseUrl, Set fetched, Set toCrawl, long elapsedTime) {
    int fetchCount = fetched.size();
    outputMessage(
        "\n\nSummary for starting Url: " + baseUrl + " and depth: " + m_crawlDepth,
        TEST_SUMMARY_MESSAGE);
    outputMessage(
        "\nUrls fetched: " + fetchCount + "    Urls extracted: " + m_extracted.size(),
        PLAIN_MESSAGE);

    // One table row per depth level; the counter arrays are indexed by (depth - 1).
    outputMessage("\nDepth  Fetched  Parsed  New URLs", PLAIN_MESSAGE);
    for (int depth = 1; depth <= m_crawlDepth; depth++) {
      PrintfFormat pf = new PrintfFormat("%5d  %7d  %6d  %8d");
      Integer[] args =
          new Integer[] {
            new Integer(depth),
            new Integer(depth_fetched[depth - 1]),
            new Integer(depth_parsed[depth - 1]),
            new Integer(depth_incl[depth - 1]),
          };
      String s = pf.sprintf(args);
      outputMessage(s, PLAIN_MESSAGE);
    }

    outputMessage("\nRemaining unfetched: " + toCrawl.size(), PLAIN_MESSAGE);

    long secs = elapsedTime / Constants.SECOND;
    long fetchRate = 0;
    // The rate is only computed once at least one whole second has elapsed, which also rules
    // out a division by zero.
    if (secs > 0) {
      // Widen to long before multiplying: the product would otherwise start out in int
      // arithmetic and could overflow before the division.
      fetchRate = (long) fetchCount * 60 * Constants.SECOND / elapsedTime;
    }
    outputMessage(
        "\nElapsed Time: " + secs + " secs." + "    Fetch Rate: " + fetchRate + " p/m",
        PLAIN_MESSAGE);
  }