Exemple #1
5
  /** Process incoming heartbeat messages from the task trackers. */
  public synchronized int emitHeartbeat(TaskTrackerStatus trackerStatus, boolean initialContact) {
    String trackerName = trackerStatus.getTrackerName();
    trackerStatus.setLastSeen(System.currentTimeMillis());

    synchronized (taskTrackers) {
      synchronized (trackerExpiryQueue) {
        boolean seenBefore = updateTaskTrackerStatus(trackerName, trackerStatus);
        if (initialContact) {
          // If it's first contact, then clear out any state hanging around
          if (seenBefore) {
            lostTaskTracker(trackerName);
          }
        } else {
          // If not first contact, there should be some record of the tracker
          if (!seenBefore) {
            return InterTrackerProtocol.UNKNOWN_TASKTRACKER;
          }
        }

        if (initialContact) {
          trackerExpiryQueue.add(trackerStatus);
        }
      }
    }

    updateTaskStatuses(trackerStatus);
    // LOG.info("Got heartbeat from "+trackerName);
    return InterTrackerProtocol.TRACKERS_OK;
  }
 @Override
 public int run(String[] args) throws Exception {
   if (args.length < 4) {
     writeUsage();
     return 1;
   }
   Path secretsPath = new Path(args[0]);
   Path saltFilePath = new Path(args[1]);
   Path inputPath = new Path(args[2]);
   Path outputPath = new Path(args[3]);
   // Make sure the salt file exists
   generateSaltIfNeeded(saltFilePath, secretsPath);
   // Configure the job
   Job job = configureJob(secretsPath, saltFilePath, inputPath, outputPath);
   // Run it
   long startTime = System.currentTimeMillis();
   job.submit();
   if (job.waitForCompletion(true)) {
     System.out.printf(
         "Done obfuscating - took %d seconds.\n", (System.currentTimeMillis() - startTime) / 1000);
   } else {
     System.err.printf("Job finished with errors: %s\n", job.getStatus().getFailureInfo());
     return 2;
   }
   return 0;
 }
Exemple #3
0
  public void inject(Path crawlDb, Path urlDir) throws IOException {
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
      LOG.info("Injector: starting at " + sdf.format(start));
      LOG.info("Injector: crawlDb: " + crawlDb);
      LOG.info("Injector: urlDir: " + urlDir);
    }

    Path tempDir =
        new Path(
            getConf().get("mapred.temp.dir", ".")
                + "/inject-temp-"
                + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    // map text input file to a <url,CrawlDatum> file
    if (LOG.isInfoEnabled()) {
      LOG.info("Injector: Converting injected urls to crawl db entries.");
    }
    JobConf sortJob = new NutchJob(getConf());
    sortJob.setJobName("inject " + urlDir);
    FileInputFormat.addInputPath(sortJob, urlDir);
    sortJob.setMapperClass(InjectMapper.class);

    FileOutputFormat.setOutputPath(sortJob, tempDir);
    sortJob.setOutputFormat(SequenceFileOutputFormat.class);
    sortJob.setOutputKeyClass(Text.class);
    sortJob.setOutputValueClass(CrawlDatum.class);
    sortJob.setLong("injector.current.time", System.currentTimeMillis());
    RunningJob mapJob = JobClient.runJob(sortJob);

    long urlsInjected = mapJob.getCounters().findCounter("injector", "urls_injected").getValue();
    long urlsFiltered = mapJob.getCounters().findCounter("injector", "urls_filtered").getValue();
    LOG.info("Injector: total number of urls rejected by filters: " + urlsFiltered);
    LOG.info(
        "Injector: total number of urls injected after normalization and filtering: "
            + urlsInjected);

    // merge with existing crawl db
    if (LOG.isInfoEnabled()) {
      LOG.info("Injector: Merging injected urls into crawl db.");
    }
    JobConf mergeJob = CrawlDb.createJob(getConf(), crawlDb);
    FileInputFormat.addInputPath(mergeJob, tempDir);
    mergeJob.setReducerClass(InjectReducer.class);
    JobClient.runJob(mergeJob);
    CrawlDb.install(mergeJob, crawlDb);

    // clean up
    FileSystem fs = FileSystem.get(getConf());
    fs.delete(tempDir, true);

    long end = System.currentTimeMillis();
    LOG.info(
        "Injector: finished at "
            + sdf.format(end)
            + ", elapsed: "
            + TimingUtil.elapsedTime(start, end));
  }
Exemple #4
0
 private static void run(Callable c, boolean read, int size) {
   // Count all i/o time from here, including all retry overheads
   long start_io_ms = System.currentTimeMillis();
   while (true) {
     try {
       long start_ns = System.nanoTime(); // Blocking i/o call timing - without counting repeats
       c.call();
       TimeLine.record_IOclose(start_ns, start_io_ms, read ? 1 : 0, size, Value.HDFS);
       break;
       // Explicitly ignore the following exceptions but
       // fail on the rest IOExceptions
     } catch (EOFException e) {
       ignoreAndWait(e, false);
     } catch (SocketTimeoutException e) {
       ignoreAndWait(e, false);
     } catch (S3Exception e) {
       // Preserve S3Exception before IOException
       // Since this is tricky code - we are supporting different HDFS version
       // New version declares S3Exception as IOException
       // But old versions (0.20.xxx) declares it as RuntimeException
       // So we have to catch it before IOException !!!
       ignoreAndWait(e, false);
     } catch (IOException e) {
       ignoreAndWait(e, true);
     } catch (Exception e) {
       throw Log.errRTExcept(e);
     }
   }
 }
Exemple #5
0
  /** Implements basic throttling capabilities. */
  public static class Throttler {

    double bytesPerSec;
    long lastTime = System.currentTimeMillis();

    public Throttler(double bytesPerSec) {
      this.bytesPerSec = bytesPerSec;
    }

    public void incrementAndThrottle(int bytes) {
      if (bytesPerSec < 1) { // no throttle at all
        return;
      }
      long currentTime = System.currentTimeMillis();
      long timeDiff = currentTime - lastTime;
      if (timeDiff == 0) {
        timeDiff = 1;
      }

      double bytesPerSec = (bytes / (double) timeDiff) * 1000;
      if (bytesPerSec > this.bytesPerSec) {
        // Throttle
        double exceededByFactorOf = bytesPerSec / this.bytesPerSec;
        try {
          long mustSleep = (long) ((exceededByFactorOf - 1) * timeDiff);
          Thread.sleep(mustSleep);
        } catch (InterruptedException e) {
          e.printStackTrace();
        }
      }

      lastTime = System.currentTimeMillis();
    }
  }
Exemple #6
0
    /**
     * The run method lives for the life of the JobTracker, and removes Jobs that are not still
     * running, but which finished a long time ago.
     */
    public void run() {
      while (shouldRun) {
        try {
          Thread.sleep(RETIRE_JOB_CHECK_INTERVAL);
        } catch (InterruptedException ie) {
        }

        synchronized (jobs) {
          synchronized (jobInitQueue) {
            synchronized (jobsByArrival) {
              for (Iterator it = jobs.keySet().iterator(); it.hasNext(); ) {
                String jobid = (String) it.next();
                JobInProgress job = (JobInProgress) jobs.get(jobid);

                if (job.getStatus().getRunState() != JobStatus.RUNNING
                    && job.getStatus().getRunState() != JobStatus.PREP
                    && (job.getFinishTime() + RETIRE_JOB_INTERVAL < System.currentTimeMillis())) {
                  it.remove();

                  jobInitQueue.remove(job);
                  jobsByArrival.remove(job);
                }
              }
            }
          }
        }
      }
    }
Exemple #7
0
 public void configure(JobConf job) {
   this.jobConf = job;
   urlNormalizers = new URLNormalizers(job, URLNormalizers.SCOPE_INJECT);
   interval = jobConf.getInt("db.fetch.interval.default", 2592000);
   filters = new URLFilters(jobConf);
   scfilters = new ScoringFilters(jobConf);
   scoreInjected = jobConf.getFloat("db.score.injected", 1.0f);
   curTime = job.getLong("injector.current.time", System.currentTimeMillis());
 }
 @Override
 public int doWork() {
   try {
     setupProcedureStore();
     ExecutorService executor = Executors.newFixedThreadPool(numThreads);
     Future<?>[] futures = new Future<?>[numThreads];
     // Start worker threads.
     long start = System.currentTimeMillis();
     for (int i = 0; i < numThreads; i++) {
       futures[i] = executor.submit(this.new Worker(start));
     }
     boolean failure = false;
     try {
       for (Future<?> future : futures) {
         long timeout = start + WORKER_THREADS_TIMEOUT_SEC * 1000 - System.currentTimeMillis();
         failure |= (future.get(timeout, TimeUnit.MILLISECONDS).equals(EXIT_FAILURE));
       }
     } catch (Exception e) {
       System.err.println("Exception in worker thread.");
       e.printStackTrace();
       return EXIT_FAILURE;
     }
     executor.shutdown();
     if (failure) {
       return EXIT_FAILURE;
     }
     long timeTaken = System.currentTimeMillis() - start;
     System.out.println("******************************************");
     System.out.println("Num threads    : " + numThreads);
     System.out.println("Num procedures : " + numProcs);
     System.out.println("Sync type      : " + syncType);
     System.out.println("Time taken     : " + (timeTaken / 1000.0f) + "sec");
     System.out.println("******************************************");
     return EXIT_SUCCESS;
   } catch (IOException e) {
     e.printStackTrace();
     return EXIT_FAILURE;
   } finally {
     tearDownProcedureStore();
   }
 }
Exemple #9
0
    /**
     * The run method lives for the life of the JobTracker, and removes TaskTrackers that have not
     * checked in for some time.
     */
    public void run() {
      while (shouldRun) {
        //
        // Thread runs periodically to check whether trackers should be expired.
        // The sleep interval must be no more than half the maximum expiry time
        // for a task tracker.
        //
        try {
          Thread.sleep(TASKTRACKER_EXPIRY_INTERVAL / 3);
        } catch (InterruptedException ie) {
        }

        //
        // Loop through all expired items in the queue
        //
        synchronized (taskTrackers) {
          synchronized (trackerExpiryQueue) {
            long now = System.currentTimeMillis();
            TaskTrackerStatus leastRecent = null;
            while ((trackerExpiryQueue.size() > 0)
                && ((leastRecent = (TaskTrackerStatus) trackerExpiryQueue.first()) != null)
                && (now - leastRecent.getLastSeen() > TASKTRACKER_EXPIRY_INTERVAL)) {

              // Remove profile from head of queue
              trackerExpiryQueue.remove(leastRecent);
              String trackerName = leastRecent.getTrackerName();

              // Figure out if last-seen time should be updated, or if tracker is dead
              TaskTrackerStatus newProfile =
                  (TaskTrackerStatus) taskTrackers.get(leastRecent.getTrackerName());
              // Items might leave the taskTracker set through other means; the
              // status stored in 'taskTrackers' might be null, which means the
              // tracker has already been destroyed.
              if (newProfile != null) {
                if (now - newProfile.getLastSeen() > TASKTRACKER_EXPIRY_INTERVAL) {
                  // Remove completely
                  updateTaskTrackerStatus(trackerName, null);
                  lostTaskTracker(leastRecent.getTrackerName());
                } else {
                  // Update time by inserting latest profile
                  trackerExpiryQueue.add(newProfile);
                }
              }
            }
          }
        }
      }
    }
Exemple #10
0
  /** Start the JobTracker process, listen on the indicated port */
  JobTracker(Configuration conf) throws IOException {
    //
    // Grab some static constants
    //
    maxCurrentTasks = conf.getInt("mapred.tasktracker.tasks.maximum", 2);
    RETIRE_JOB_INTERVAL = conf.getLong("mapred.jobtracker.retirejob.interval", 24 * 60 * 60 * 1000);
    RETIRE_JOB_CHECK_INTERVAL = conf.getLong("mapred.jobtracker.retirejob.check", 60 * 1000);
    TASK_ALLOC_EPSILON = conf.getFloat("mapred.jobtracker.taskalloc.loadbalance.epsilon", 0.2f);
    PAD_FRACTION = conf.getFloat("mapred.jobtracker.taskalloc.capacitypad", 0.1f);
    MIN_SLOTS_FOR_PADDING = 3 * maxCurrentTasks;

    // This is a directory of temporary submission files.  We delete it
    // on startup, and can delete any files that we're done with
    this.conf = conf;
    JobConf jobConf = new JobConf(conf);
    this.systemDir = jobConf.getSystemDir();
    this.fs = FileSystem.get(conf);
    FileUtil.fullyDelete(fs, systemDir);
    fs.mkdirs(systemDir);

    // Same with 'localDir' except it's always on the local disk.
    jobConf.deleteLocalFiles(SUBDIR);

    // Set ports, start RPC servers, etc.
    InetSocketAddress addr = getAddress(conf);
    this.localMachine = addr.getHostName();
    this.port = addr.getPort();
    this.interTrackerServer = RPC.getServer(this, addr.getPort(), 10, false, conf);
    this.interTrackerServer.start();
    Properties p = System.getProperties();
    for (Iterator it = p.keySet().iterator(); it.hasNext(); ) {
      String key = (String) it.next();
      String val = (String) p.getProperty(key);
      LOG.info("Property '" + key + "' is " + val);
    }

    this.infoPort = conf.getInt("mapred.job.tracker.info.port", 50030);
    this.infoServer = new JobTrackerInfoServer(this, infoPort);
    this.infoServer.start();

    this.startTime = System.currentTimeMillis();

    new Thread(this.expireTrackers).start();
    new Thread(this.retireJobs).start();
    new Thread(this.initJobs).start();
  }
Exemple #11
0
 private static void run(Callable c, boolean read, int size) {
   // Count all i/o time from here, including all retry overheads
   long start_io_ms = System.currentTimeMillis();
   while (true) {
     try {
       long start_ns = System.nanoTime(); // Blocking i/o call timing - without counting repeats
       c.call();
       TimeLine.record_IOclose(start_ns, start_io_ms, read ? 1 : 0, size, Value.HDFS);
       break;
       // Explicitly ignore the following exceptions but
       // fail on the rest IOExceptions
     } catch (EOFException e) {
       ignoreAndWait(e, false);
     } catch (SocketTimeoutException e) {
       ignoreAndWait(e, false);
     } catch (IOException e) {
       ignoreAndWait(e, true);
     } catch (Exception e) {
       throw Log.errRTExcept(e);
     }
   }
 }
 // TODO: Can also collect #procs, time taken by each thread to measure fairness.
 @Override
 public Integer call() throws IOException {
   while (true) {
     if (workersFailed.get()) {
       return EXIT_FAILURE;
     }
     long procId = procIds.getAndIncrement();
     if (procId >= numProcs) {
       break;
     }
     if (procId != 0 && procId % 10000 == 0) {
       long ms = System.currentTimeMillis() - start;
       System.out.println("Wrote " + procId + " procedures in " + StringUtils.humanTimeDiff(ms));
     }
     try {
       if (procId > 0 && procId % numProcsPerWal == 0) {
         store.rollWriterForTesting();
         System.out.println(
             "Starting new log : "
                 + store.getActiveLogs().get(store.getActiveLogs().size() - 1));
       }
     } catch (IOException ioe) {
       // Ask other threads to quit too.
       workersFailed.set(true);
       System.err.println("Exception when rolling log file. Current procId = " + procId);
       ioe.printStackTrace();
       return EXIT_FAILURE;
     }
     ProcedureTestingUtility.TestProcedure proc =
         new ProcedureTestingUtility.TestProcedure(procId);
     proc.setData(serializedState);
     store.insert(proc, null);
     store.update(proc);
   }
   return EXIT_SUCCESS;
 }