@SuppressWarnings("deprecation")
  private void createControlFile(
      FileSystem fs,
      long nrBytes, // in bytes
      int nrFiles)
      throws IOException {
    LOG.info("creating control file: " + nrBytes + " bytes, " + nrFiles + " files");

    Path controlDir = getControlDir(config);
    fs.delete(controlDir, true);

    for (int i = 0; i < nrFiles; i++) {
      String name = getFileName(i);
      Path controlFile = new Path(controlDir, "in_file_" + name);
      SequenceFile.Writer writer = null;
      try {
        writer =
            SequenceFile.createWriter(
                fs, config, controlFile, Text.class, LongWritable.class, CompressionType.NONE);
        writer.append(new Text(name), new LongWritable(nrBytes));
      } catch (Exception e) {
        throw new IOException(e.getLocalizedMessage());
      } finally {
        if (writer != null) writer.close();
        writer = null;
      }
    }
    LOG.info("created control files for: " + nrFiles + " files");
  }
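The createWriter(FileSystem, Configuration, ...) overload used above is deprecated, which is why the method carries @SuppressWarnings("deprecation"). A minimal sketch of the same write using the non-deprecated, option-based factory (assuming Hadoop 2.x and the same config, controlFile, name and nrBytes variables as in the method body):

    // Sketch only: option-based equivalent of the deprecated createWriter call above.
    SequenceFile.Writer writer =
        SequenceFile.createWriter(
            config,
            SequenceFile.Writer.file(controlFile),
            SequenceFile.Writer.keyClass(Text.class),
            SequenceFile.Writer.valueClass(LongWritable.class),
            SequenceFile.Writer.compression(CompressionType.NONE));
    try {
      writer.append(new Text(name), new LongWritable(nrBytes));
    } finally {
      writer.close();
    }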
Example 2
 public static void createCentersSequenceFile(
     Configuration conf, FileSystem fs, String centroidsPath, String sequenceFilePath)
     throws Exception {
   Path seqFile = new Path(sequenceFilePath);
   if (fs.exists(seqFile)) {
     fs.delete(seqFile, true);
   }
   SequenceFile.Writer writer =
       SequenceFile.createWriter(fs, conf, seqFile, Centroid.class, IntWritable.class);
   IntWritable value = new IntWritable(0);
   // Read the plain-text centroids file line by line; the original available()/readLine()
   // loop on the raw FSDataInputStream is deprecated and unreliable, so use a BufferedReader.
   try (BufferedReader reader =
       new BufferedReader(new InputStreamReader(fs.open(new Path(centroidsPath))))) {
     String line;
     while ((line = reader.readLine()) != null) {
       StringTokenizer tokenizer = new StringTokenizer(line, " ");
       int dim = tokenizer.countTokens() - 1;
       int clusterId = Integer.parseInt(tokenizer.nextToken());
       double[] coords = new double[dim];
       for (int i = 0; i < dim; i++) {
         coords[i] = Double.parseDouble(tokenizer.nextToken());
       }
       Centroid cluster = new Centroid(clusterId, new Point(coords));
       writer.append(cluster, value);
     }
   } finally {
     IOUtils.closeStream(writer);
   }
 }
Example 3
  private static void createControlFile(
      FileSystem fs,
      int fileSize, // in MB
      int nrFiles)
      throws IOException {
    LOG.info("creating control file: " + fileSize + " mega bytes, " + nrFiles + " files");

    fs.delete(CONTROL_DIR, true);

    for (int i = 0; i < nrFiles; i++) {
      String name = getFileName(i);
      Path controlFile = new Path(CONTROL_DIR, "in_file_" + name);
      SequenceFile.Writer writer = null;
      try {
        writer =
            SequenceFile.createWriter(
                fs, fsConfig, controlFile, Text.class, LongWritable.class, CompressionType.NONE);
        writer.append(new Text(name), new LongWritable(fileSize));
      } catch (Exception e) {
        throw new IOException(e.getLocalizedMessage());
      } finally {
        if (writer != null) writer.close();
        writer = null;
      }
    }
    LOG.info("created control files for: " + nrFiles + " files");
  }
  @Override
  public boolean write(String uri, InputStream stream) {
    SequenceFile.Writer writer = getWriterFor(uri);

    try {
      // available() only reports the bytes readable without blocking, so this assumes the
      // whole payload is already buffered in memory (see the sketch after this method).
      int size = stream.available();
      byte[] bytes = new byte[size];
      int readBytes = stream.read(bytes);
      if (readBytes != size) {
        log.error(
            "Could not read all the bytes from the InputStream. Read "
                + readBytes
                + " instead of "
                + size);
        return false;
      }
      HDFSByteChunk byteChunk = new HDFSByteChunk(bytes, uri);
      writer.append(new IntWritable(0), byteChunk);
      return true;
    } catch (IOException e) {
      e.printStackTrace();
      return false;
    } finally {
      // Close the writer on every path, including early returns and exceptions.
      IOUtils.closeStream(writer);
    }
  }
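Because InputStream.available() only reports the bytes readable without blocking, the size check above can fail for streams that are not fully buffered. A hedged alternative (readFully here is a hypothetical helper, not part of the class shown) drains the stream completely before appending:

  // Hypothetical helper: read an InputStream to completion without relying on available().
  private static byte[] readFully(InputStream stream) throws IOException {
    ByteArrayOutputStream buffer = new ByteArrayOutputStream();
    byte[] chunk = new byte[8192];
    int n;
    while ((n = stream.read(chunk)) != -1) {
      buffer.write(chunk, 0, n);
    }
    return buffer.toByteArray();
  }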
  public static void main(String args[]) throws Exception {
    if (args.length != 2) {
      System.err.println("argumentos: dir-de-entrada arquivo-de-saida");
      System.exit(1);
    }

    FileSystem fs = FileSystem.get(confHadoop);
    Path inPath = new Path(args[0]);
    Path outPath = new Path(args[1] + "/dataset");
    FSDataInputStream in = null;
    SequenceFile.Writer writer = null;
    List<Path> files = listFiles(inPath, jpegFilter);
    try {
      writer = SequenceFile.createWriter(fs, confHadoop, outPath, Text.class, BytesWritable.class);
      for (Path p : files) {
        in = fs.open(p);
        // Size the buffer from the file status rather than available(), which is not
        // guaranteed to report the full remaining length of the file.
        byte[] buffer = new byte[(int) fs.getFileStatus(p).getLen()];
        in.readFully(buffer);
        writer.append(new Text(p.getName()), new BytesWritable(buffer));
        in.close();
      }
    } finally {
      IOUtils.closeStream(writer);
    }
  }
  private void writeSeqenceFileTest(
      FileSystem fs, int rowCount, Path file, int columnNum, CompressionCodec codec)
      throws IOException {

    byte[][] columnRandom;

    resetRandomGenerators();

    BytesRefArrayWritable bytes = new BytesRefArrayWritable(columnNum);
    columnRandom = new byte[columnNum][];
    for (int i = 0; i < columnNum; i++) {
      BytesRefWritable cu = new BytesRefWritable();
      bytes.set(i, cu);
    }

    // zero length key is not allowed by block compress writer, so we use a byte
    // writable
    ByteWritable key = new ByteWritable();
    SequenceFile.Writer seqWriter =
        SequenceFile.createWriter(
            fs,
            conf,
            file,
            ByteWritable.class,
            BytesRefArrayWritable.class,
            CompressionType.BLOCK,
            codec);

    for (int i = 0; i < rowCount; i++) {
      nextRandomRow(columnRandom, bytes);
      seqWriter.append(key, bytes);
    }
    seqWriter.close();
  }
Example 7
  /**
   * set up input file which has the list of input files.
   *
   * @return boolean
   * @throws IOException
   */
  private boolean setup() throws IOException {
    estimateSavings();

    final String randomId = getRandomId();
    JobClient jClient = new JobClient(jobconf);
    Path jobdir = new Path(jClient.getSystemDir(), NAME + "_" + randomId);

    LOG.info(JOB_DIR_LABEL + "=" + jobdir);
    jobconf.set(JOB_DIR_LABEL, jobdir.toString());
    Path log = new Path(jobdir, "_logs");

    // The control file should have small size blocks. This helps
    // in spreading out the load from mappers that will be spawned.
    jobconf.setInt("dfs.blocks.size", OP_LIST_BLOCK_SIZE);

    FileOutputFormat.setOutputPath(jobconf, log);
    LOG.info("log=" + log);

    // create operation list
    FileSystem fs = jobdir.getFileSystem(jobconf);
    Path opList = new Path(jobdir, "_" + OP_LIST_LABEL);
    jobconf.set(OP_LIST_LABEL, opList.toString());
    int opCount = 0, synCount = 0;
    SequenceFile.Writer opWriter = null;

    try {
      opWriter =
          SequenceFile.createWriter(
              fs, jobconf, opList, Text.class, PolicyInfo.class, SequenceFile.CompressionType.NONE);
      for (RaidPolicyPathPair p : raidPolicyPathPairList) {
        // If a large set of files are Raided for the first time, files
        // in the same directory that tend to have the same size will end up
        // with the same map. This shuffle mixes things up, allowing a better
        // mix of files.
        java.util.Collections.shuffle(p.srcPaths);
        for (FileStatus st : p.srcPaths) {
          opWriter.append(new Text(st.getPath().toString()), p.policy);
          opCount++;
          if (++synCount > SYNC_FILE_MAX) {
            opWriter.sync();
            synCount = 0;
          }
        }
      }

    } finally {
      if (opWriter != null) {
        opWriter.close();
      }
      fs.setReplication(opList, OP_LIST_REPLICATION); // increase replication for control file
    }
    raidPolicyPathPairList.clear();

    jobconf.setInt(OP_COUNT_LABEL, opCount);
    LOG.info("Number of files=" + opCount);
    jobconf.setNumMapTasks(
        getMapCount(opCount, new JobClient(jobconf).getClusterStatus().getTaskTrackers()));
    LOG.info("jobName= " + jobName + " numMapTasks=" + jobconf.getNumMapTasks());
    return opCount != 0;
  }
Example 8
 private void close() throws IOException {
   for (SequenceFile.Writer writer : writers.values()) {
     writer.close();
   }
   writers.clear();
   LOG.info("closed writer");
 }
  public static void main(String[] args) throws IOException {
    String uri = args[0];

    Configuration conf = new Configuration();

    Path path = new Path(uri);

    FileSystem fs = FileSystem.get(URI.create(uri), conf);

    IntWritable key = new IntWritable();
    Text value = new Text();
    SequenceFile.Writer writer = null;
    try {
      writer = SequenceFile.createWriter(fs, conf, path, key.getClass(), value.getClass());

      for (int i = 0; i < 100; i++) {
        key.set(100 - i);
        value.set(DATA[i % DATA.length]);

        writer.append(key, value);
      }
    } finally {
      IOUtils.closeStream(writer);
    }
  }
  private void writeToFileListing(
      SequenceFile.Writer fileListWriter,
      FileStatus fileStatus,
      Path sourcePathRoot,
      boolean localFile)
      throws IOException {
    if (fileStatus.getPath().equals(sourcePathRoot) && fileStatus.isDir())
      return; // Skip the root-paths.

    if (LOG.isDebugEnabled()) {
      LOG.debug(
          "REL PATH: "
              + DistCpUtils.getRelativePath(sourcePathRoot, fileStatus.getPath())
              + ", FULL PATH: "
              + fileStatus.getPath());
    }

    FileStatus status = fileStatus;
    if (localFile) {
      status = getFileStatus(fileStatus);
    }

    fileListWriter.append(
        new Text(DistCpUtils.getRelativePath(sourcePathRoot, fileStatus.getPath())), status);
    fileListWriter.sync();

    if (!fileStatus.isDir()) {
      totalBytesToCopy += fileStatus.getLen();
    }
    totalPaths++;
  }
  public static void createControlFile(FileSystem fs, long megaBytes, int numFiles, long seed)
      throws Exception {

    LOG.info("creating control file: " + megaBytes + " bytes, " + numFiles + " files");

    Path controlFile = new Path(CONTROL_DIR, "files");
    fs.delete(controlFile, true);
    Random random = new Random(seed);

    SequenceFile.Writer writer =
        SequenceFile.createWriter(
            fs, conf, controlFile, Text.class, LongWritable.class, CompressionType.NONE);

    long totalSize = 0;
    long maxSize = ((megaBytes / numFiles) * 2) + 1;
    try {
      while (totalSize < megaBytes) {
        Text name = new Text(Long.toString(random.nextLong()));

        long size = random.nextLong();
        if (size < 0) size = -size;
        size = size % maxSize;

        // LOG.info(" adding: name="+name+" size="+size);

        writer.append(name, new LongWritable(size));

        totalSize += size;
      }
    } finally {
      writer.close();
    }
    LOG.info("created control file for: " + totalSize + " bytes");
  }
Example 12
 /** return a mapping of expected keys -> records */
 private HashMap<String, Record> createTextSequenceFile(File file, int numRecords)
     throws IOException {
   HashMap<String, Record> map = new HashMap<String, Record>();
   SequenceFile.Metadata metadata = new SequenceFile.Metadata(getMetadataForSequenceFile());
   FSDataOutputStream out = new FSDataOutputStream(new FileOutputStream(file), null);
   SequenceFile.Writer writer = null;
   try {
     writer =
         SequenceFile.createWriter(
             new Configuration(),
             out,
             Text.class,
             Text.class,
             SequenceFile.CompressionType.NONE,
             null,
             metadata);
     for (int i = 0; i < numRecords; ++i) {
       Text key = new Text("key" + i);
       Text value = new Text("value" + i);
       writer.append(key, value);
       Record record = new Record();
       record.put("key", key);
       record.put("value", value);
       map.put(key.toString(), record);
     }
   } finally {
     Closeables.closeQuietly(writer);
   }
   return map;
 }
Example 13
    @SuppressWarnings("deprecation")
    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
      super.cleanup(context);
      List<Cluster> newKMeansClusters = new ArrayList<Cluster>();
      List<Cluster> newCanopyClusters = new ArrayList<Cluster>();

      for (Cluster kMeansCluster : _clusters.keySet()) {
        Cluster canopyCluster = _kMeansToCanopyMap.get(kMeansCluster);

        // Set a new Cluster center
        Vector center = new Vector();
        center.setElements(new double[kMeansCluster.getCenterVector().getElements().length]);
        List<Vector> vectors = new ArrayList<Vector>();

        for (Vector currentVector : _clusters.get(kMeansCluster)) {
          vectors.add(new Vector(currentVector));

          // Sum the vectors element-wise; dividing by the count below yields the mean,
          // which becomes the new cluster center.
          for (int i = 0; i < currentVector.getElements().length; i++)
            center.getElements()[i] += currentVector.getElements()[i];
        }

        // Divide each element by the number of vectors to get the mean; the resulting
        // center is a synthetic vector that need not coincide with any input vector.
        for (int i = 0; i < center.getElements().length; i++)
          center.getElements()[i] = center.getElements()[i] / vectors.size();

        Cluster newKMeansCluster = new Cluster(center);
        canopyCluster.setIsCovered(newKMeansCluster.isConvergedWithOtherCluster(kMeansCluster));
        newKMeansClusters.add(newKMeansCluster);
        newCanopyClusters.add(canopyCluster);

        // Adding the vectors to the new cluster center
        for (Vector vector : vectors) {
          context.write(newKMeansCluster, vector);
        }
      }

      Configuration conf = context.getConfiguration();
      Path outPath = new Path(conf.get("centers.path"));
      FileSystem fs = FileSystem.get(conf);

      if (fs.exists(outPath)) fs.delete(outPath, true);

      SequenceFile.Writer writer =
          SequenceFile.createWriter(
              fs, context.getConfiguration(), outPath, Cluster.class, Cluster.class);
      context.getCounter(Counter.CONVERGED).setValue(0);

      for (int i = 0; i < newKMeansClusters.size(); i++) {
        writer.append(newCanopyClusters.get(i), newKMeansClusters.get(i));

        if (newCanopyClusters.get(i).getIsCovered())
          context.getCounter(Counter.CONVERGED).increment(1);
      }

      writer.close();
    }
 private static <T extends WritableComparable> Path writePartitionFile(
     String testname, JobConf conf, T[] splits) throws IOException {
   final FileSystem fs = FileSystem.getLocal(conf);
   final Path testdir = new Path(System.getProperty("test.build.data", "/tmp")).makeQualified(fs);
   Path p = new Path(testdir, testname + "/_partition.lst");
   TotalOrderPartitioner.setPartitionFile(conf, p);
   conf.setNumReduceTasks(splits.length + 1);
   SequenceFile.Writer w = null;
   try {
     NullWritable nw = NullWritable.get();
     w =
         SequenceFile.createWriter(
             fs,
             conf,
             p,
             splits[0].getClass(),
             NullWritable.class,
             SequenceFile.CompressionType.NONE);
     for (int i = 0; i < splits.length; ++i) {
       w.append(splits[i], NullWritable.get());
     }
   } finally {
     if (null != w) w.close();
   }
   return p;
 }
  public static void writeCanopyCenters(Configuration conf, ArrayList<StockVector> canopyCenters)
      throws IOException {
    FileSystem fs = FileSystem.get(conf);

    IntWritable IntKey = new IntWritable(1);
    Path canopyFileName = new Path(Nasdaq.CANOPY_SEQ_FILE_PATH);

    System.out.println("before seq file");
    // create file
    @SuppressWarnings("deprecation")
    final SequenceFile.Writer writer =
        SequenceFile.createWriter(fs, conf, canopyFileName, StockVector.class, IntWritable.class);

    System.out.println("after seq file");
    System.out.println("canopies" + canopyCenters.size());
    for (StockVector canopyCenter : canopyCenters) {
      // write canopy to file
      writer.append(canopyCenter, IntKey);
      System.out.println("sum " + canopyCenter.GetSum());
    }
    System.out.println("canopies end" + canopyCenters.size());
    // close writer and file system
    writer.close();
    // fs.close();
  }
Example 16
  /**
   * Create a data file in SequenceFile format that gets exported to the db.
   *
   * @param fileNum the number of the file (for multi-file export).
   * @param numRecords how many records to write to the file.
   * @param className the table class name to instantiate and populate for each record.
   */
  private void createSequenceFile(int fileNum, int numRecords, String className)
      throws IOException {

    try {
      // Instantiate the value record object via reflection.
      Class cls = Class.forName(className, true, Thread.currentThread().getContextClassLoader());
      SqoopRecord record = (SqoopRecord) ReflectionUtils.newInstance(cls, new Configuration());

      // Create the SequenceFile.
      Configuration conf = new Configuration();
      conf.set("fs.default.name", "file:///");
      FileSystem fs = FileSystem.get(conf);
      Path tablePath = getTablePath();
      Path filePath = new Path(tablePath, "part" + fileNum);
      fs.mkdirs(tablePath);
      SequenceFile.Writer w =
          SequenceFile.createWriter(fs, conf, filePath, LongWritable.class, cls);

      // Now write the data.
      int startId = fileNum * numRecords;
      for (int i = 0; i < numRecords; i++) {
        record.parse(getRecordLine(startId + i));
        w.append(new LongWritable(startId + i), record);
      }

      w.close();
    } catch (ClassNotFoundException cnfe) {
      throw new IOException(cnfe);
    } catch (RecordParser.ParseError pe) {
      throw new IOException(pe);
    }
  }
Example 17
 /**
  * Write a partition file for the given job, using the Sampler provided. Queries the sampler for a
  * sample keyset, sorts by the output key comparator, selects the keys for each rank, and writes
  * to the destination returned from {@link TotalOrderPartitioner#getPartitionFile}.
  */
 @SuppressWarnings("unchecked") // getInputFormat, getOutputKeyComparator
 public static <K, V> void writePartitionFile(Job job, Sampler<K, V> sampler)
     throws IOException, ClassNotFoundException, InterruptedException {
   Configuration conf = job.getConfiguration();
   final InputFormat inf = ReflectionUtils.newInstance(job.getInputFormatClass(), conf);
   int numPartitions = job.getNumReduceTasks();
   K[] samples = sampler.getSample(inf, job);
   RawComparator<K> comparator = (RawComparator<K>) job.getSortComparator();
   Arrays.sort(samples, comparator);
   Path dst = new Path(TotalOrderPartitioner.getPartitionFile(conf));
   FileSystem fs = dst.getFileSystem(conf);
   if (fs.exists(dst)) {
     fs.delete(dst, false);
   }
   SequenceFile.Writer writer =
       SequenceFile.createWriter(fs, conf, dst, job.getMapOutputKeyClass(), NullWritable.class);
   NullWritable nullValue = NullWritable.get();
   float stepSize = samples.length / (float) numPartitions;
   int last = -1;
   for (int i = 1; i < numPartitions; ++i) {
     int k = Math.round(stepSize * i);
     while (last >= k && comparator.compare(samples[last], samples[k]) == 0) {
       ++k;
     }
     writer.append(samples[k], nullValue);
     last = k;
   }
   writer.close();
 }
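This mirrors the logic of Hadoop's InputSampler.writePartitionFile. A minimal usage sketch follows; the job setup, sampler parameters and partition path are illustrative and assume the job's map output key class matches the sampled key type (Text here):

  // Sketch: sample the input, write the partition file, and enable total-order partitioning.
  Job job = Job.getInstance(new Configuration(), "total-order-sort");
  job.setNumReduceTasks(10);
  TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), new Path("/tmp/_partitions"));
  job.setPartitionerClass(TotalOrderPartitioner.class);

  // Sample roughly 1% of records, at most 10,000 samples drawn from up to 10 splits.
  InputSampler.Sampler<Text, Text> sampler = new InputSampler.RandomSampler<>(0.01, 10000, 10);
  writePartitionFile(job, sampler);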
  @Before
  public void setUp() throws Exception {
    // create local Pig server
    pigServer = UnitTestUtil.makePigServer();

    // create temp SequenceFile
    File tempFile = File.createTempFile("test", ".txt");
    tempFilename = tempFile.getAbsolutePath();
    Path path = new Path("file:///" + tempFilename);
    Configuration conf = new Configuration();
    FileSystem fs = path.getFileSystem(conf);
    IntWritable key = new IntWritable();
    Text value = new Text();
    SequenceFile.Writer writer = null;
    try {
      writer = SequenceFile.createWriter(fs, conf, path, key.getClass(), value.getClass());
      for (int i = 0; i < DATA.length; ++i) {
        key.set(i);
        value.set(DATA[i]);
        writer.append(key, value);
      }
    } finally {
      IOUtils.closeStream(writer);
    }
  }
Example 19
  @Override
  public boolean write(String uri, List<InputStream> streams) {
    boolean result = false;
    ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
    try {
      for (InputStream stream : streams) {

        int size = stream.available();
        byte[] bytes = new byte[size];
        int readBytes = stream.read(bytes);
        if (readBytes != size) {
          log.error(
              "Could not read all the bytes from the inputStream. Read "
                  + readBytes
                  + " instead of "
                  + size);
          return false;
        }
        outputStream.write(bytes);
      }
      HDFSByteChunk byteChunk = new HDFSByteChunk(outputStream.toByteArray(), uri);
      SequenceFile.Writer writer = getWriterFor(uri);
      writer.append(new IntWritable(0), byteChunk);
      writer.close();
      result = true; // only report success once the chunk is written and the writer closed
    } catch (IOException e) {
      e.printStackTrace();
    }
    return result;
  }
  /**
   * Write out a SequenceFile that can be read by TotalOrderPartitioner that contains the split
   * points in startKeys.
   *
   * <p>This method was copied from HFileOutputFormat in hbase-0.90.1-cdh3u0. I had to copy it
   * because it's private.
   *
   * @param conf The job configuration.
   * @param partitionsPath output path for SequenceFile.
   * @param startKeys the region start keys to use as the partitions.
   * @throws IOException If there is an error.
   */
  private static void writePartitionFile(
      Configuration conf, Path partitionsPath, List<HFileKeyValue> startKeys) throws IOException {
    if (startKeys.isEmpty()) {
      throw new IllegalArgumentException("No regions passed");
    }

    // We're generating a list of split points, and we don't ever
    // have keys < the first region (which has an empty start key)
    // so we need to remove it. Otherwise we would end up with an
    // empty reducer with index 0.
    TreeSet<HFileKeyValue> sorted = new TreeSet<HFileKeyValue>();
    sorted.addAll(startKeys);

    HFileKeyValue first = sorted.first();
    if (0 != first.getRowKey().length) {
      throw new IllegalArgumentException(
          "First region of table should have empty start row key. Instead has: "
              + Bytes.toStringBinary(first.getRowKey()));
    }
    sorted.remove(first);

    // Write the actual file
    final SequenceFile.Writer writer =
        KijiMRPlatformBridge.get()
            .newSeqFileWriter(conf, partitionsPath, HFileKeyValue.class, NullWritable.class);

    try {
      for (HFileKeyValue startKey : sorted) {
        writer.append(startKey, NullWritable.get());
      }
    } finally {
      writer.close();
    }
  }
  /** Creates the input file (containing the names of the files to be fixed). */
  private List<String> createInputFile(
      String jobName, Path inDir, Map<String, Integer> corruptFilePriority, int priority)
      throws IOException {
    Path file = new Path(inDir, jobName + IN_FILE_SUFFIX);
    FileSystem fs = file.getFileSystem(getConf());
    SequenceFile.Writer fileOut =
        SequenceFile.createWriter(fs, getConf(), file, LongWritable.class, Text.class);
    long index = 0L;

    List<String> filesAdded = new ArrayList<String>();
    int count = 0;
    final long max = filesPerTask * BLOCKFIX_TASKS_PER_JOB;
    for (Map.Entry<String, Integer> entry : corruptFilePriority.entrySet()) {
      if (entry.getValue() != priority) {
        continue;
      }
      if (count >= max) {
        break;
      }
      String corruptFileName = entry.getKey();
      fileOut.append(new LongWritable(index++), new Text(corruptFileName));
      filesAdded.add(corruptFileName);
      count++;

      if (index % filesPerTask == 0) {
        fileOut.sync(); // create sync point to make sure we can split here
      }
    }

    fileOut.close();
    return filesAdded;
  }
 /**
  * @param state The final LanczosState to be serialized
  * @param outputPath The path (relative to the current Configuration's FileSystem) to save the
  *     output to.
  */
 public void serializeOutput(LanczosState state, Path outputPath) throws IOException {
   int numEigenVectors = state.getIterationNumber();
   log.info("Persisting {} eigenVectors and eigenValues to: {}", numEigenVectors, outputPath);
   Configuration conf = getConf() != null ? getConf() : new Configuration();
   FileSystem fs = FileSystem.get(outputPath.toUri(), conf);
   SequenceFile.Writer seqWriter =
       new SequenceFile.Writer(fs, conf, outputPath, IntWritable.class, VectorWritable.class);
   try {
     IntWritable iw = new IntWritable();
     for (int i = 0; i < numEigenVectors; i++) {
        // Persist eigenvectors sorted by eigenvalues in descending order.
       NamedVector v =
           new NamedVector(
               state.getRightSingularVector(numEigenVectors - 1 - i),
               "eigenVector"
                   + i
                   + ", eigenvalue = "
                   + state.getSingularValue(numEigenVectors - 1 - i));
       Writable vw = new VectorWritable(v);
       iw.set(i);
       seqWriter.append(iw, vw);
     }
   } finally {
     Closeables.close(seqWriter, false);
   }
 }
Example 23
  @Override
  public boolean writeData(String uri, byte[] data) {
    /*
     * Delete the parent folder if it already exists.
     */

    File f = new File(uri);
    if (f.getName().equals(storageConfiguration.getProperty("postfix"))) {
      f = f.getParentFile();
      Path file = new Path(String.valueOf(f));
      try {
        if (fileSystem.exists(file)) {
          fileSystem.delete(file, true);
        }
      } catch (IOException e) {
        e.printStackTrace();
      }
    }

    SequenceFile.Writer writer = getWriterFor(uri);
    HDFSByteChunk byteChunk = new HDFSByteChunk(data, uri);
    try {
      writer.append(new IntWritable(0), byteChunk);
      writer.close();
    } catch (IOException e) {
      e.printStackTrace();
      return false;
    }
    return true;
  }
Example 24
  /**
   * Read the document frequency List which is built at the end of the DF Count Job. This will use
   * constant memory and will run at the speed of your disk read
   */
  private static Pair<Long[], List<Path>> createDictionaryChunks(
      Path featureCountPath,
      Path dictionaryPathBase,
      Configuration baseConf,
      int chunkSizeInMegabytes)
      throws IOException {
    List<Path> chunkPaths = Lists.newArrayList();
    Configuration conf = new Configuration(baseConf);

    FileSystem fs = FileSystem.get(featureCountPath.toUri(), conf);

    long chunkSizeLimit = chunkSizeInMegabytes * 1024L * 1024L;
    int chunkIndex = 0;
    Path chunkPath = new Path(dictionaryPathBase, FREQUENCY_FILE + chunkIndex);
    chunkPaths.add(chunkPath);
    SequenceFile.Writer freqWriter =
        new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class, LongWritable.class);

    try {
      long currentChunkSize = 0;
      long featureCount = 0;
      long vectorCount = Long.MAX_VALUE;
      Path filesPattern = new Path(featureCountPath, OUTPUT_FILES_PATTERN);
      for (Pair<IntWritable, LongWritable> record :
          new SequenceFileDirIterable<IntWritable, LongWritable>(
              filesPattern, PathType.GLOB, null, null, true, conf)) {

        if (currentChunkSize > chunkSizeLimit) {
          Closeables.close(freqWriter, false);
          chunkIndex++;

          chunkPath = new Path(dictionaryPathBase, FREQUENCY_FILE + chunkIndex);
          chunkPaths.add(chunkPath);

          freqWriter =
              new SequenceFile.Writer(fs, conf, chunkPath, IntWritable.class, LongWritable.class);
          currentChunkSize = 0;
        }

        int fieldSize = SEQUENCEFILE_BYTE_OVERHEAD + Integer.SIZE / 8 + Long.SIZE / 8;
        currentChunkSize += fieldSize;
        IntWritable key = record.getFirst();
        LongWritable value = record.getSecond();
        if (key.get() >= 0) {
          freqWriter.append(key, value);
        } else if (key.get() == -1) {
          vectorCount = value.get();
        }
        featureCount = Math.max(key.get(), featureCount);
      }
      featureCount++;
      Long[] counts = {featureCount, vectorCount};
      return new Pair<Long[], List<Path>>(counts, chunkPaths);
    } finally {
      Closeables.close(freqWriter, false);
    }
  }
Example 25
  public static Job createTimesSquaredJob(
      Configuration initialConf,
      Vector v,
      int outputVectorDim,
      Path matrixInputPath,
      Path outputVectorPathBase,
      Class<? extends TimesSquaredMapper> mapClass,
      Class<? extends VectorSummingReducer> redClass)
      throws IOException {

    FileSystem fs = FileSystem.get(matrixInputPath.toUri(), initialConf);
    matrixInputPath = fs.makeQualified(matrixInputPath);
    outputVectorPathBase = fs.makeQualified(outputVectorPathBase);

    long now = System.nanoTime();
    Path inputVectorPath = new Path(outputVectorPathBase, INPUT_VECTOR + '/' + now);

    SequenceFile.Writer inputVectorPathWriter = null;

    try {
      inputVectorPathWriter =
          new SequenceFile.Writer(
              fs, initialConf, inputVectorPath, NullWritable.class, VectorWritable.class);
      inputVectorPathWriter.append(NullWritable.get(), new VectorWritable(v));
    } finally {
      Closeables.close(inputVectorPathWriter, false);
    }

    URI ivpURI = inputVectorPath.toUri();
    DistributedCache.setCacheFiles(new URI[] {ivpURI}, initialConf);

    Job job =
        HadoopUtil.prepareJob(
            matrixInputPath,
            new Path(outputVectorPathBase, OUTPUT_VECTOR_FILENAME),
            SequenceFileInputFormat.class,
            mapClass,
            NullWritable.class,
            VectorWritable.class,
            redClass,
            NullWritable.class,
            VectorWritable.class,
            SequenceFileOutputFormat.class,
            initialConf);
    job.setCombinerClass(redClass);
    job.setJobName("TimesSquaredJob: " + matrixInputPath);

    Configuration conf = job.getConfiguration();
    conf.set(INPUT_VECTOR, ivpURI.toString());
    conf.setBoolean(IS_SPARSE_OUTPUT, !v.isDense());
    conf.setInt(OUTPUT_VECTOR_DIMENSION, outputVectorDim);

    return job;
  }
  private static Path saveVector(Configuration conf, Path path, Vector v) throws IOException {
    FileSystem fs = path.getFileSystem(conf);
    SequenceFile.Writer writer =
        new SequenceFile.Writer(fs, conf, path, IntWritable.class, VectorWritable.class);

    try {
      writer.append(new IntWritable(0), new VectorWritable(v));
    } finally {
      writer.close();
    }
    return path;
  }
Example 27
 private void write(Object part, int id, Vector vector) throws IOException {
   SequenceFile.Writer writer = writers.get(part);
   if (writer == null) {
     Configuration conf = UDFContext.getUDFContext().getJobConf();
     Path file = PathUtils.enter(getStorePath(), String.valueOf(part), "part-" + Env.getPartID());
     writer = IOUtils.forSequenceWrite(conf, file, IntWritable.class, VectorWritable.class);
     writers.put(part, writer);
   }
   keyWritable.set(id);
   valueWritable.set(vector);
   writer.append(keyWritable, valueWritable);
 }
Example 28
 private static void writeClassifier(
     ClusterClassifier classifier, Configuration config, Path path, FileSystem fs)
     throws IOException {
   SequenceFile.Writer writer =
       new SequenceFile.Writer(fs, config, path, Text.class, ClusterClassifier.class);
   Writable key = new Text("test");
   try {
     writer.append(key, classifier);
   } finally {
     Closeables.closeQuietly(writer);
   }
 }
Example 29
 /** Reduce task done, write output to a file. */
 @Override
 public void close() throws IOException {
   // write output to a file
   Path outDir = new Path(TMP_DIR, "out");
   Path outFile = new Path(outDir, "reduce-out");
   FileSystem fileSys = FileSystem.get(conf);
   SequenceFile.Writer writer =
       SequenceFile.createWriter(
           fileSys, conf, outFile, LongWritable.class, LongWritable.class, CompressionType.NONE);
   writer.append(new LongWritable(numInside), new LongWritable(numOutside));
   writer.close();
 }
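After the job completes, the driver typically reads this single record back to compute the final estimate. A minimal sketch using the same TMP_DIR and conf fields (and the older Reader constructor, to match the writer style above):

  // Sketch: read the (numInside, numOutside) record back in the job driver.
  Path inFile = new Path(new Path(TMP_DIR, "out"), "reduce-out");
  FileSystem fileSys = FileSystem.get(conf);
  LongWritable numInside = new LongWritable();
  LongWritable numOutside = new LongWritable();
  SequenceFile.Reader reader = new SequenceFile.Reader(fileSys, inFile, conf);
  try {
    reader.next(numInside, numOutside);
  } finally {
    reader.close();
  }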
  public static void toSequenceFile(String fileName, Collection<String> pdbIds, boolean verbose)
      throws IOException {

    int failure = 0;
    int success = 0;
    int chains = 0;

    try (SequenceFile.Writer writer =
        SequenceFile.createWriter(
            new Configuration(),
            SequenceFile.Writer.file(new Path(fileName)),
            SequenceFile.Writer.keyClass(Text.class),
            SequenceFile.Writer.valueClass(IntArrayWritable.class),
            SequenceFile.Writer.compression(
                SequenceFile.CompressionType.BLOCK, new BZip2Codec()))) {
      for (String pdbId : pdbIds) {
        if (verbose) {
          System.out.println(pdbId);
        }

        Structure s = null;
        try {
          s = StructureIO.getStructure(pdbId);
          success++;
        } catch (Exception e) {
          // some files can't be read. Let's just skip those!
          e.printStackTrace();
          failure++;
          continue;
        }

        if (s == null) {
          System.err.println("structure null: " + pdbId);
          continue;
        }

        if (s.getChains().size() == 0) {
          continue;
        }

        chains += append(writer, pdbId, s);
      }
    }

    if (verbose) {
      System.out.println("Total structures: " + pdbIds.size());
      System.out.println("Success: " + success);
      System.out.println("Failure: " + failure);
      System.out.println("Chains: " + chains);
    }
  }