public void configure(JobConf conf) {
    numberOfCenters = Integer.valueOf(conf.get("numberOfCenters"));
    centersDirectory = conf.get("centersReadDirectory");

    try {
      Configuration c = new Configuration();
      FileSystem fs = FileSystem.get(c);

      for (int index = 0; index < numberOfCenters; ++index) {
        SequenceFile.Reader reader =
            new SequenceFile.Reader(fs, new Path(centersDirectory + "/centers/" + index), c);

        LongWritable key = new LongWritable();
        Point value = new Point();

        reader.next(key, value);

        Point center = (Point) value;

        centers.add(center);

        reader.close();
      }
    } catch (IOException e) {
      // do nothing
      // I hope this doesn't happen
      System.out.println("well, damn.");
      e.printStackTrace();
    }
  }
Example #2
0
  public void inject(Path crawlDb, Path urlDir) throws IOException {
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
      LOG.info("Injector: starting at " + sdf.format(start));
      LOG.info("Injector: crawlDb: " + crawlDb);
      LOG.info("Injector: urlDir: " + urlDir);
    }

    Path tempDir =
        new Path(
            getConf().get("mapred.temp.dir", ".")
                + "/inject-temp-"
                + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    // map text input file to a <url,CrawlDatum> file
    if (LOG.isInfoEnabled()) {
      LOG.info("Injector: Converting injected urls to crawl db entries.");
    }
    JobConf sortJob = new NutchJob(getConf());
    sortJob.setJobName("inject " + urlDir);
    FileInputFormat.addInputPath(sortJob, urlDir);
    sortJob.setMapperClass(InjectMapper.class);

    FileOutputFormat.setOutputPath(sortJob, tempDir);
    sortJob.setOutputFormat(SequenceFileOutputFormat.class);
    sortJob.setOutputKeyClass(Text.class);
    sortJob.setOutputValueClass(CrawlDatum.class);
    sortJob.setLong("injector.current.time", System.currentTimeMillis());
    RunningJob mapJob = JobClient.runJob(sortJob);

    long urlsInjected = mapJob.getCounters().findCounter("injector", "urls_injected").getValue();
    long urlsFiltered = mapJob.getCounters().findCounter("injector", "urls_filtered").getValue();
    LOG.info("Injector: total number of urls rejected by filters: " + urlsFiltered);
    LOG.info(
        "Injector: total number of urls injected after normalization and filtering: "
            + urlsInjected);

    // merge with existing crawl db
    if (LOG.isInfoEnabled()) {
      LOG.info("Injector: Merging injected urls into crawl db.");
    }
    JobConf mergeJob = CrawlDb.createJob(getConf(), crawlDb);
    FileInputFormat.addInputPath(mergeJob, tempDir);
    mergeJob.setReducerClass(InjectReducer.class);
    JobClient.runJob(mergeJob);
    CrawlDb.install(mergeJob, crawlDb);

    // clean up
    FileSystem fs = FileSystem.get(getConf());
    fs.delete(tempDir, true);

    long end = System.currentTimeMillis();
    LOG.info(
        "Injector: finished at "
            + sdf.format(end)
            + ", elapsed: "
            + TimingUtil.elapsedTime(start, end));
  }
Example #3
0
    protected void setup(Context context) throws IOException, InterruptedException {
      super.setup(context);

      Path pt = new Path("/user/yao/query/query");
      FileSystem fs = FileSystem.get(new Configuration());
      BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(pt)));
      String line = br.readLine();
      String[] keywords = line.split(",");
      k0 = keywords[0];
      k1 = keywords[1];
      k2 = keywords[2];
      br.close();
    }
Example #4
0
 /*
  * Returns the relative path of the dir this cache will be localized in
  * relative path that this cache will be localized in. For
  * hdfs://hostname:port/absolute_path -- the relative path is
  * hostname/absolute path -- if it is just /absolute_path -- then the
  * relative path is hostname of DFS this mapred cluster is running
  * on/absolute_path
  */
 public static String makeRelative(URI cache, Configuration conf) throws IOException {
   String host = cache.getHost();
   if (host == null) {
     host = cache.getScheme();
   }
   if (host == null) {
     URI defaultUri = FileSystem.get(conf).getUri();
     host = defaultUri.getHost();
     if (host == null) {
       host = defaultUri.getScheme();
     }
   }
   String path = host + cache.getPath();
   path = path.replace(":/", "/"); // remove windows device colon
   return path;
 }
Example #5
0
  /** Start the JobTracker process, listen on the indicated port */
  JobTracker(Configuration conf) throws IOException {
    //
    // Grab some static constants
    //
    maxCurrentTasks = conf.getInt("mapred.tasktracker.tasks.maximum", 2);
    RETIRE_JOB_INTERVAL = conf.getLong("mapred.jobtracker.retirejob.interval", 24 * 60 * 60 * 1000);
    RETIRE_JOB_CHECK_INTERVAL = conf.getLong("mapred.jobtracker.retirejob.check", 60 * 1000);
    TASK_ALLOC_EPSILON = conf.getFloat("mapred.jobtracker.taskalloc.loadbalance.epsilon", 0.2f);
    PAD_FRACTION = conf.getFloat("mapred.jobtracker.taskalloc.capacitypad", 0.1f);
    MIN_SLOTS_FOR_PADDING = 3 * maxCurrentTasks;

    // This is a directory of temporary submission files.  We delete it
    // on startup, and can delete any files that we're done with
    this.conf = conf;
    JobConf jobConf = new JobConf(conf);
    this.systemDir = jobConf.getSystemDir();
    this.fs = FileSystem.get(conf);
    FileUtil.fullyDelete(fs, systemDir);
    fs.mkdirs(systemDir);

    // Same with 'localDir' except it's always on the local disk.
    jobConf.deleteLocalFiles(SUBDIR);

    // Set ports, start RPC servers, etc.
    InetSocketAddress addr = getAddress(conf);
    this.localMachine = addr.getHostName();
    this.port = addr.getPort();
    this.interTrackerServer = RPC.getServer(this, addr.getPort(), 10, false, conf);
    this.interTrackerServer.start();
    Properties p = System.getProperties();
    for (Iterator it = p.keySet().iterator(); it.hasNext(); ) {
      String key = (String) it.next();
      String val = (String) p.getProperty(key);
      LOG.info("Property '" + key + "' is " + val);
    }

    this.infoPort = conf.getInt("mapred.job.tracker.info.port", 50030);
    this.infoServer = new JobTrackerInfoServer(this, infoPort);
    this.infoServer.start();

    this.startTime = System.currentTimeMillis();

    new Thread(this.expireTrackers).start();
    new Thread(this.retireJobs).start();
    new Thread(this.initJobs).start();
  }
Example #6
0
 private static Path checkCacheStatusValidity(
     Configuration conf,
     URI cache,
     long confFileStamp,
     CacheStatus cacheStatus,
     FileStatus fileStatus,
     boolean isArchive)
     throws IOException {
   FileSystem fs = FileSystem.get(cache, conf);
   // Has to be
   if (!ifExistsAndFresh(conf, fs, cache, confFileStamp, cacheStatus, fileStatus)) {
     throw new IOException(
         "Stale cache file: " + cacheStatus.localizedLoadPath + " for cache-file: " + cache);
   }
   LOG.info(
       String.format(
           "Using existing cache of %s->%s", cache.toString(), cacheStatus.localizedLoadPath));
   return cacheStatus.localizedLoadPath;
 }
Example #7
0
  public BufferedReader loadDataFromFile(String filepath, long offset) {
    try {
      Path pt = new Path(filepath);
      FileSystem fs = FileSystem.get(fsConf);
      InputStreamReader isr;
      if (fs.isDirectory(pt)) { // multiple parts
        isr = new InputStreamReader(OpenMultiplePartsWithOffset(fs, pt, offset));
      } else { // single file
        FSDataInputStream fileHandler = fs.open(pt);
        if (offset > 0) fileHandler.seek(offset);
        isr = new InputStreamReader(fileHandler);
      }

      BufferedReader reader = new BufferedReader(isr);
      if (offset > 0) reader.readLine(); // skip first line in case of seek
      return reader;
    } catch (FileNotFoundException e) {
      e.printStackTrace();
    } catch (IOException e) {
      e.printStackTrace();
    }
    assert false : "Should not reach here!";
    return null;
  }
  /**
   * Test how IPC cache map works.
   *
   * @throws Exception If failed.
   */
  @SuppressWarnings("unchecked")
  public void testIpcCache() throws Exception {
    Field cacheField = GridGgfsHadoopIpcIo.class.getDeclaredField("ipcCache");

    cacheField.setAccessible(true);

    Field activeCntField = GridGgfsHadoopIpcIo.class.getDeclaredField("activeCnt");

    activeCntField.setAccessible(true);

    Map<String, GridGgfsHadoopIpcIo> cache =
        (Map<String, GridGgfsHadoopIpcIo>) cacheField.get(null);

    String name = "ggfs:" + getTestGridName(0) + "@";

    Configuration cfg = new Configuration();

    cfg.addResource(U.resolveGridGainUrl(HADOOP_FS_CFG));
    cfg.setBoolean("fs.ggfs.impl.disable.cache", true);
    cfg.setBoolean(String.format(GridGgfsHadoopUtils.PARAM_GGFS_ENDPOINT_NO_EMBED, name), true);

    // Ensure that existing IO is reused.
    FileSystem fs1 = FileSystem.get(new URI("ggfs://" + name + "/"), cfg);

    assertEquals(1, cache.size());

    GridGgfsHadoopIpcIo io = null;

    System.out.println("CACHE: " + cache);

    for (String key : cache.keySet()) {
      if (key.contains("10500")) {
        io = cache.get(key);

        break;
      }
    }

    assert io != null;

    assertEquals(1, ((AtomicInteger) activeCntField.get(io)).get());

    // Ensure that when IO is used by multiple file systems and one of them is closed, IO is not
    // stopped.
    FileSystem fs2 = FileSystem.get(new URI("ggfs://" + name + "/abc"), cfg);

    assertEquals(1, cache.size());
    assertEquals(2, ((AtomicInteger) activeCntField.get(io)).get());

    fs2.close();

    assertEquals(1, cache.size());
    assertEquals(1, ((AtomicInteger) activeCntField.get(io)).get());

    Field stopField = GridGgfsHadoopIpcIo.class.getDeclaredField("stopping");

    stopField.setAccessible(true);

    assert !(Boolean) stopField.get(io);

    // Ensure that IO is stopped when nobody else is need it.
    fs1.close();

    assert cache.isEmpty();

    assert (Boolean) stopField.get(io);
  }
Example #9
0
 private static FileSystem getFileSystem(URI cache, Configuration conf) throws IOException {
   String fileSysName = getFileSysName(cache);
   if (fileSysName != null) return FileSystem.getNamed(fileSysName, conf);
   else return FileSystem.get(conf);
 }
Example #10
0
  /**
   * Returns the status of a given cache file on hdfs.
   *
   * @param conf configuration
   * @param cache cache file
   * @return FileStatus object of the file
   * @throws IOException
   */
  public static FileStatus getFileStatus(Configuration conf, URI cache) throws IOException {
    FileSystem fileSystem = FileSystem.get(cache, conf);
    Path filePath = new Path(cache.getPath());

    return fileSystem.getFileStatus(filePath);
  }
Example #11
0
  /**
   * Returns mtime of a given cache file on hdfs.
   *
   * @param conf configuration
   * @param cache cache file
   * @return mtime of a given cache file on hdfs
   * @throws IOException
   */
  public static long getTimestamp(Configuration conf, URI cache) throws IOException {
    FileSystem fileSystem = FileSystem.get(cache, conf);
    Path filePath = new Path(cache.getPath());

    return fileSystem.getFileStatus(filePath).getModificationTime();
  }
Example #12
0
  public int run(String[] args) throws Exception {
    // printUsage();
    /*
     * SETUP
     */
    Configuration argConf = getConf();
    Hashtable<String, String> confArg = new Hashtable<String, String>();
    setup(confArg, argConf);
    Date currentTime = new Date();
    Date endDate = new Date(new Long(confArg.get("timestamp_stop")));
    Boolean full_run = confArg.get("intermediate").matches("(?i).*true.*");
    Boolean quick_add = confArg.get("quick_add").matches("(?i).*true.*");
    logger.info("Running GeStore");

    // ZooKeeper setup
    Configuration config = HBaseConfiguration.create();
    zkWatcher = new ZooKeeperWatcher(config, "Testing", new HBaseAdmin(config));
    zkInstance =
        new ZooKeeper(
            ZKConfig.getZKQuorumServersString(config),
            config.getInt("zookeeper.session.timeout", -1),
            zkWatcher);

    if (!confArg.get("task_id").isEmpty()) {
      confArg.put("temp_path", confArg.get("temp_path") + confArg.get("task_id"));
    }

    String lockRequest = confArg.get("file_id");
    if (!confArg.get("run_id").isEmpty())
      lockRequest = lockRequest + "_" + confArg.get("run_id") + "_";
    if (!confArg.get("task_id").isEmpty())
      lockRequest = lockRequest + "_" + confArg.get("task_id") + "_";

    // Get type of movement
    toFrom type_move = checkArgs(confArg);
    if (type_move == toFrom.LOCAL2REMOTE && !confArg.get("format").equals("unknown")) {
      List<String> arguments = new ArrayList<String>();
      arguments.add("-Dinput=" + confArg.get("local_path"));
      arguments.add("-Dtable=" + confArg.get("file_id"));
      arguments.add("-Dtimestamp=" + confArg.get("timestamp_stop"));
      arguments.add("-Dtype=" + confArg.get("format"));
      arguments.add("-Dtarget_dir=" + confArg.get("base_path") + "_" + confArg.get("file_id"));
      arguments.add("-Dtemp_hdfs_path=" + confArg.get("temp_path"));
      arguments.add("-Drun_id=" + confArg.get("run_id"));
      if (!confArg.get("run_id").isEmpty()) arguments.add("-Drun_id=" + confArg.get("run_id"));
      if (!confArg.get("task_id").isEmpty()) arguments.add("-Dtask_id=" + confArg.get("task_id"));
      if (quick_add) arguments.add("-Dquick_add=" + confArg.get("quick_add"));
      String lockName = lock(lockRequest);
      String[] argumentString = arguments.toArray(new String[arguments.size()]);
      adddb.main(argumentString);
      unlock(lockName);
      System.exit(0);
    }

    // Database registration

    dbutil db_util = new dbutil(config);
    db_util.register_database(confArg.get("db_name_files"), true);
    db_util.register_database(confArg.get("db_name_runs"), true);
    db_util.register_database(confArg.get("db_name_updates"), true);
    FileSystem hdfs = FileSystem.get(config);
    FileSystem localFS = FileSystem.getLocal(config);

    // Get source type
    confArg.put("source", getSource(db_util, confArg.get("db_name_files"), confArg.get("file_id")));
    confArg.put(
        "database", isDatabase(db_util, confArg.get("db_name_files"), confArg.get("file_id")));
    if (!confArg.get("source").equals("local")
        && type_move == toFrom.REMOTE2LOCAL
        && !confArg.get("timestamp_stop").equals(Integer.toString(Integer.MAX_VALUE))) {
      confArg.put("timestamp_stop", Long.toString(latestVersion(confArg, db_util)));
    }

    /*
     * Get previous timestamp
     */
    Get run_id_get = new Get(confArg.get("run_id").getBytes());
    Result run_get = db_util.doGet(confArg.get("db_name_runs"), run_id_get);
    KeyValue run_file_prev =
        run_get.getColumnLatest(
            "d".getBytes(), (confArg.get("file_id") + "_db_timestamp").getBytes());
    String last_timestamp = new String("0");
    if (null != run_file_prev && !confArg.get("source").equals("local")) {
      long last_timestamp_real = run_file_prev.getTimestamp();
      Long current_timestamp = new Long(confArg.get("timestamp_real"));
      if ((current_timestamp - last_timestamp_real) > 36000) {
        last_timestamp = new String(run_file_prev.getValue());
        Integer lastTimestamp = new Integer(last_timestamp);
        lastTimestamp += 1;
        last_timestamp = lastTimestamp.toString();
        logger.info("Last timestamp: " + last_timestamp + " End data: " + endDate);
        Date last_run = new Date(run_file_prev.getTimestamp());
        if (last_run.before(endDate) && !full_run) {
          confArg.put("timestamp_start", last_timestamp);
        }
      }
    }

    Integer tse = new Integer(confArg.get("timestamp_stop"));
    Integer tss = new Integer(confArg.get("timestamp_start"));
    if (tss > tse) {
      logger.info("No new version of requested file.");
      return 0;
    }

    /*
     * Generate file
     */

    String lockName = lock(lockRequest);

    Get file_id_get = new Get(confArg.get("file_id").getBytes());
    Result file_get = db_util.doGet(confArg.get("db_name_files"), file_id_get);
    if (!file_get.isEmpty()) {
      boolean found =
          hasFile(
              db_util,
              hdfs,
              confArg.get("db_name_files"),
              confArg.get("file_id"),
              getFullPath(confArg));
      if (confArg.get("source").equals("fullfile")) {
        found = false;
      }
      String filenames_put =
          getFileNames(
              db_util, confArg.get("db_name_files"), confArg.get("file_id"), getFullPath(confArg));
      // Filename not found in file database
      if (!found && type_move == toFrom.REMOTE2LOCAL) {
        if (!confArg.get("source").equals("local")) {
          // Generate intermediate file
          if (getFile(hdfs, confArg, db_util) == null) {
            unlock(lockName);
            return 1;
          }
          // Put generated file into file database
          if (!confArg.get("format").equals("fullfile")) {
            putFileEntry(
                db_util,
                hdfs,
                confArg.get("db_name_files"),
                confArg.get("file_id"),
                confArg.get("full_file_name"),
                confArg.get("source"));
          }
        } else {
          logger.warn("Remote file not found, and cannot be generated! File: " + confArg);
          unlock(lockName);
          return 1;
        }
      }
    } else {
      if (type_move == toFrom.REMOTE2LOCAL) {
        logger.warn("Remote file not found, and cannot be generated.");
        unlock(lockName);
        return 1;
      }
    }

    /*
     * Copy file
     * Update tables
     */

    if (type_move == toFrom.LOCAL2REMOTE) {
      if (!confArg.get("format").equals("fullfile")) {
        putFileEntry(
            db_util,
            hdfs,
            confArg.get("db_name_files"),
            confArg.get("file_id"),
            getFullPath(confArg),
            confArg.get("source"));
      }
      putRunEntry(
          db_util,
          confArg.get("db_name_runs"),
          confArg.get("run_id"),
          confArg.get("file_id"),
          confArg.get("type"),
          confArg.get("timestamp_real"),
          confArg.get("timestamp_stop"),
          getFullPath(confArg),
          confArg.get("delimiter"));
      hdfs.copyFromLocalFile(new Path(confArg.get("local_path")), new Path(getFullPath(confArg)));
    } else if (type_move == toFrom.REMOTE2LOCAL) {
      FileStatus[] files = hdfs.globStatus(new Path(getFullPath(confArg) + "*"));
      putRunEntry(
          db_util,
          confArg.get("db_name_runs"),
          confArg.get("run_id"),
          confArg.get("file_id"),
          confArg.get("type"),
          confArg.get("timestamp_real"),
          confArg.get("timestamp_stop"),
          getFullPath(confArg),
          confArg.get("delimiter"));
      unlock(lockName);
      for (FileStatus file : files) {
        Path cur_file = file.getPath();
        Path cur_local_path =
            new Path(new String(confArg.get("local_path") + confArg.get("file_id")));
        String suffix = getSuffix(getFileName(confArg), cur_file.getName());
        if (suffix.length() > 0) {
          cur_local_path = cur_local_path.suffix(new String("." + suffix));
        }
        if (confArg.get("copy").equals("true")) {
          String crc = hdfs.getFileChecksum(cur_file).toString();
          if (checksumLocalTest(cur_local_path, crc)) {
            continue;
          } else {
            hdfs.copyToLocalFile(cur_file, cur_local_path);
            writeChecksum(cur_local_path, crc);
          }
        } else {
          System.out.println(cur_local_path + "\t" + cur_file);
        }
      }
    }
    unlock(lockName);
    return 0;
  }