예제 #1
0
  public void copyInitialState(Path origAppDir) throws IOException {
    // locate previous snapshot
    String newAppDir = this.dag.assertAppPath();

    FSRecoveryHandler recoveryHandler = new FSRecoveryHandler(origAppDir.toString(), conf);
    // read snapshot against new dependencies
    Object snapshot = recoveryHandler.restore();
    if (snapshot == null) {
      throw new IllegalArgumentException("No previous application state found in " + origAppDir);
    }
    InputStream logIs = recoveryHandler.getLog();

    // modify snapshot state to switch app id
    ((StreamingContainerManager.CheckpointState) snapshot).setApplicationId(this.dag, conf);
    Path checkpointPath = new Path(newAppDir, LogicalPlan.SUBDIR_CHECKPOINTS);

    FileSystem fs = FileSystem.newInstance(origAppDir.toUri(), conf);
    // remove the path that was created by the storage agent during deserialization and replacement
    fs.delete(checkpointPath, true);

    // write snapshot to new location
    recoveryHandler = new FSRecoveryHandler(newAppDir, conf);
    recoveryHandler.save(snapshot);
    OutputStream logOs = recoveryHandler.rotateLog();
    IOUtils.copy(logIs, logOs);
    logOs.flush();
    logOs.close();
    logIs.close();

    // copy sub directories that are not present in target
    FileStatus[] lFiles = fs.listStatus(origAppDir);
    for (FileStatus f : lFiles) {
      if (f.isDirectory()) {
        String targetPath = f.getPath().toString().replace(origAppDir.toString(), newAppDir);
        if (!fs.exists(new Path(targetPath))) {
          LOG.debug("Copying {} to {}", f.getPath(), targetPath);
          FileUtil.copy(fs, f.getPath(), fs, new Path(targetPath), false, conf);
          // FSUtil.copy(fs, f, fs, new Path(targetPath), false, false, conf);
        } else {
          LOG.debug("Ignoring {} as it already exists under {}", f.getPath(), targetPath);
          // FSUtil.setPermission(fs, new Path(targetPath), new FsPermission((short)0777));
        }
      }
    }
  }
 public static void recursePath(Configuration conf, Path path, Job job) {
   try {
     FileSystem fs = path.getFileSystem(conf);
     FileStatus[] fstats = fs.listStatus(path);
     if (fstats != null) {
       for (FileStatus f : fstats) {
         Path p = f.getPath();
         ;
         if (fs.isFile(p)) {
           // connection times out otherwise
           System.err.println("file:" + p.toString());
           FileInputFormat.addInputPath(job, p);
         } else {
           System.err.println("dir:" + p.toString());
           recursePath(conf, p, job);
         }
       }
     }
   } catch (IOException e) {
     // shouldn't be here
     throw new RuntimeException(e);
   }
 }
예제 #3
0
  public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

    if (args.length != 2) {
      System.out.println("Usage: FeatureMatching ID <inputName.jpeg/inputName.jpg>");
      System.exit(1);
    }

    SimpleDateFormat sdf = new SimpleDateFormat("", Locale.US);
    sdf.applyPattern("yyyy-MM-dd_HH-mm-ss");
    String time = sdf.format(new Date());

    Job job = Job.getInstance();

    ID = "/" + args[0];
    String filename = args[1];
    filename = filename.toLowerCase();
    System.out.println("current filename:" + filename);

    // Detect illegal username (if the username dir doesn't exist)
    File userPath = new File(LOCAL_USER_DIR + ID);
    if (!userPath.exists()) {
      System.out.println("Unauthorized username!!!\nExiting......");
      System.exit(1);
    }
    // Preprocess the input image.jpg from local dir: /local.../user/ID/input/image.jpg
    // Save the features to local dir: hdfs://.../user/ID/input/image.jpg
    extractQueryFeatures2HDFS(filename, job);

    // Add the feature file to the hdfs cache
    String featureFileName = filename.substring(0, filename.lastIndexOf(".")) + ".json";
    //        job.addCacheFile(new Path(HDFS_HOME + USER + ID + INPUT + "/" +
    // featureFileName).toUri());
    job.getConfiguration()
        .set("featureFilePath", HDFS_HOME + USER + ID + INPUT + "/" + featureFileName);

    // Check the file type. Only support jpeg/jpg type images
    String type = filename.substring(args[1].lastIndexOf("."));
    if (!(type.equals(".jpg") || type.equals(".jpeg"))) {
      System.out.println("Image type not supported!!!\nExiting");
      System.exit(1);
    }

    // Input: hdfs://.../features/
    // The feature dir is a location of all features extracted from the database
    String inputPathStr = HDFS_HOME + FEATURES;
    // Output: hdfs://.../user/ID/output/
    String outputPathStr = HDFS_HOME + USER + ID + OUTPUT + "/" + time;

    job.setInputFormatClass(KeyValueTextInputFormat.class);
    //        job.setOutputFormatClass(TextOutputFormat.class);

    // Get the lists of all feature files: /.../features/data/part-*
    FileSystem fs = FileSystem.get(job.getConfiguration());
    FileStatus[] statuses = fs.listStatus(new Path(inputPathStr));
    StringBuffer sb = new StringBuffer();
    for (FileStatus fileStatus : statuses) {
      sb.append(fileStatus.getPath() + ",");
    }
    sb.deleteCharAt(sb.lastIndexOf(","));

    job.setJarByClass(FeatureMatching.class);
    job.setMapperClass(FeatureMatchMapper.class);
    job.setReducerClass(FeatureMatchReducer.class);

    // only need one reducer to collect the result
    job.setNumReduceTasks(1);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(Text.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(DoubleWritable.class);

    // Input a directory, so need the recursive input
    FileInputFormat.setInputDirRecursive(job, true);
    // Set the PathFilter, to omit _SUCCESS files
    // (This is not working correctly, as the PathFilter class is an interface rather than a class.
    // But the 2nd arg asks me to extend the PathFilter)
    //        FileInputFormat.setInputPathFilter(job, MyPathFilter.class);
    //
    //        FileInputFormat.setInputPaths(job, new Path(inputPathStr));
    FileInputFormat.setInputPaths(job, sb.toString());
    FileOutputFormat.setOutputPath(job, new Path(outputPathStr));

    boolean success = job.waitForCompletion(true);
    return success ? 0 : 1;
  }
  /**
   * Method to go though the HDFS filesystem in a DFS to find all files
   *
   * <p>fs:FileSystem object from HDFS minDate: Oldest date for files to be backed up maxDate:Newest
   * date for files to be backed up p:Path in HDFS to look for files pathList:Will be filled with
   * all files in p hmTimestamps: hashmap of timestamps for later sorting
   */
  public void checkDir(
      FileSystem fs,
      long minDate,
      long maxDate,
      Path p,
      ArrayList<Path> pathList,
      HashMap<Path, Long> hmTimestamps) {
    long tmpDate;
    FileStatus[] fStat;

    try {
      String sPath = p.toUri().getPath();

      // If this is a directory
      if (fs.getFileStatus(p).isDir()) {
        // ignore certain directories
        if ("dfstmp".equals(p.getName())
            || "tmp".equals(p.getName())
            || "jobtracker".equals(p.getName())
            || sPath.startsWith("/mapred")
            || "ops".equals(p.getName())
            || p.getName().startsWith("_distcp_logs")) {
          return;
        }

        // dump the mkdir and chmod commands for this
        // directory -- skip root directory only
        {
          FileStatus stat = fs.getFileStatus(p);

          if (!sPath.equals("/")) {
            m_wrMkdirs.println("hadoop fs -mkdir " + sPath);
          }

          m_wrChmods.println(
              "hadoop fs -chown " + stat.getOwner() + ":" + stat.getGroup() + " " + sPath);

          Short sh = new Short(stat.getPermission().toShort());
          m_wrChmods.println(
              "hadoop fs -chmod " + Long.toOctalString(sh.longValue()) + " " + sPath);
        }

        fStat = fs.listStatus(p);

        // Do a recursive call to all elements
        for (int i = 0; i < fStat.length; i++) {
          checkDir(fs, minDate, maxDate, fStat[i].getPath(), pathList, hmTimestamps);
        }
      } else {
        // If not a directory then we've found a file

        // ignore crc files
        if (p.getName().endsWith(".crc")) {
          return;
        }

        // ignore other files
        if (sPath.startsWith("/user/oozie/etl/workflows/")) {
          return;
        }

        // try to get the table name from the path. There are
        // various types of tables, from those replicated from
        // another database to regular hive tables to
        // partitioned hive tables.  We use table names to
        // both exclude some from the backup, and for the rest
        // to dump out the schema and partition name.
        if (m_ignoreTables != null && m_ignoreTables.doIgnoreFile(sPath)) {
          m_nIgnoredTables++;

          if (m_nIgnoredTables < 5) {
            System.out.println("Skipping ignore-table file: " + sPath);
          } else if (m_nIgnoredTables == 5) {
            System.out.println("(...not showing other skipped tables...)");
          }
          return;
        }

        FileStatus stat = fs.getFileStatus(p);

        tmpDate = stat.getModificationTime() / 1000;

        // store the chmods/chowns for all files
        m_wrChmods.println(
            "hadoop fs -chown " + stat.getOwner() + ":" + stat.getGroup() + " " + sPath);

        m_wrChmods.println("hadoop fs -chmod " + stat.getPermission().toShort() + " " + sPath);

        // check dates.  is it too young?
        if (tmpDate < minDate) {
          return;
        }

        // is the file too recent?
        if (tmpDate > maxDate) {
          // System.out.println("file too recent: " + sPath);
          return;
        }

        // file timestamp is ok
        pathList.add(p);

        hmTimestamps.put(p, new Long(tmpDate));

        // store info about total bytes neeed to backup
        m_nTotalBytes += fs.getContentSummary(p).getLength();
      }
    } catch (IOException e) {
      System.err.println("ERROR: could not open " + p + ": " + e);

      // System.exit(1) ;
    }
  }
예제 #5
0
  public void doTestTextBatchAppend() throws Exception {
    LOG.debug("Starting...");

    final long rollCount = 10;
    final long batchSize = 2;
    final String fileName = "PageView";

    String newPath = testPath + "/singleTextBucket";
    int totalEvents = 0;
    int i = 1, j = 1;

    // clear the test directory
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    Path dirPath = new Path(newPath);
    fs.delete(dirPath, true);
    fs.mkdirs(dirPath);

    Context context = new Context();
    context.put("hdfs.path", newPath);
    context.put("hdfs.rollCount", String.valueOf(rollCount));
    context.put("hdfs.batchSize", String.valueOf(batchSize));
    context.put("hdfs.filePrefix", "pageview");

    Channel channel = new MemoryChannel();
    Configurables.configure(channel, context);
    sink.setChannel(channel);
    sink.start();

    Calendar eventDate = Calendar.getInstance();
    Date currentDate = new Date();
    Map<String, String> header = new HashMap<String, String>();
    header.put("topic", "PageView");

    List<String> bodies = Lists.newArrayList();

    // 将测试的事件推入到通道中
    for (i = 1; i <= (rollCount * 10) / batchSize; i++) {
      Transaction txn = channel.getTransaction();
      txn.begin();
      for (j = 1; j <= batchSize; j++) {
        header.put("timestamp", String.valueOf(currentDate.getTime()));
        Event event = new SimpleEvent();
        eventDate.clear();
        eventDate.set(2014, i, i, i, 0);
        String body = "Test." + i + "." + j;

        event.setHeaders(header);
        event.setBody(body.getBytes());
        bodies.add(body);
        channel.put(event);
        totalEvents++;
      }
      txn.commit();
      txn.close();

      // execute sink to process the events
      sink.process();
    }
    sink.stop();

    FileStatus[] dirStat = fs.listStatus(dirPath);
    Path fList[] = FileUtil.stat2Paths(dirStat);

    long expectedFiles = totalEvents / rollCount;
    if (totalEvents % rollCount > 0) {
      expectedFiles++;
    }

    Assert.assertEquals(
        "num files wrong, found: " + Lists.newArrayList(fList), expectedFiles, fList.length);
    // 检查所有写入文件的内容
    verifyOutputTextFiles(fs, conf, dirPath.toUri().getPath(), fileName, bodies);
  }
예제 #6
0
    static void checkRecords(Configuration defaults, Path sortInput, Path sortOutput)
        throws IOException {
      FileSystem inputfs = sortInput.getFileSystem(defaults);
      FileSystem outputfs = sortOutput.getFileSystem(defaults);
      FileSystem defaultfs = FileSystem.get(defaults);
      JobConf jobConf = new JobConf(defaults, RecordStatsChecker.class);
      jobConf.setJobName("sortvalidate-recordstats-checker");

      int noSortReduceTasks = outputfs.listStatus(sortOutput, sortPathsFilter).length;
      jobConf.setInt("sortvalidate.sort.reduce.tasks", noSortReduceTasks);
      int noSortInputpaths = inputfs.listStatus(sortInput).length;

      jobConf.setInputFormat(NonSplitableSequenceFileInputFormat.class);
      jobConf.setOutputFormat(SequenceFileOutputFormat.class);

      jobConf.setOutputKeyClass(IntWritable.class);
      jobConf.setOutputValueClass(RecordStatsChecker.RecordStatsWritable.class);

      jobConf.setMapperClass(Map.class);
      jobConf.setCombinerClass(Reduce.class);
      jobConf.setReducerClass(Reduce.class);

      jobConf.setNumMapTasks(noSortReduceTasks);
      jobConf.setNumReduceTasks(1);

      FileInputFormat.setInputPaths(jobConf, sortInput);
      FileInputFormat.addInputPath(jobConf, sortOutput);
      Path outputPath = new Path("/tmp/sortvalidate/recordstatschecker");
      if (defaultfs.exists(outputPath)) {
        defaultfs.delete(outputPath, true);
      }
      FileOutputFormat.setOutputPath(jobConf, outputPath);

      // Uncomment to run locally in a single process
      // job_conf.set("mapred.job.tracker", "local");
      Path[] inputPaths = FileInputFormat.getInputPaths(jobConf);
      System.out.println(
          "\nSortValidator.RecordStatsChecker: Validate sort "
              + "from "
              + inputPaths[0]
              + " ("
              + noSortInputpaths
              + " files), "
              + inputPaths[1]
              + " ("
              + noSortReduceTasks
              + " files) into "
              + FileOutputFormat.getOutputPath(jobConf)
              + " with 1 reducer.");
      Date startTime = new Date();
      System.out.println("Job started: " + startTime);
      JobClient.runJob(jobConf);
      Date end_time = new Date();
      System.out.println("Job ended: " + end_time);
      System.out.println(
          "The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");

      // Check to ensure that the statistics of the
      // framework's sort-input and sort-output match
      SequenceFile.Reader stats =
          new SequenceFile.Reader(defaultfs, new Path(outputPath, "part-00000"), defaults);
      IntWritable k1 = new IntWritable();
      IntWritable k2 = new IntWritable();
      RecordStatsWritable v1 = new RecordStatsWritable();
      RecordStatsWritable v2 = new RecordStatsWritable();
      if (!stats.next(k1, v1)) {
        throw new IOException("Failed to read record #1 from reduce's output");
      }
      if (!stats.next(k2, v2)) {
        throw new IOException("Failed to read record #2 from reduce's output");
      }

      if ((v1.getBytes() != v2.getBytes())
          || (v1.getRecords() != v2.getRecords())
          || v1.getChecksum() != v2.getChecksum()) {
        throw new IOException(
            "("
                + v1.getBytes()
                + ", "
                + v1.getRecords()
                + ", "
                + v1.getChecksum()
                + ") v/s ("
                + v2.getBytes()
                + ", "
                + v2.getRecords()
                + ", "
                + v2.getChecksum()
                + ")");
      }
    }