Exemple #1
0
 /**
  * Recursively walks {@code p} on {@code fs} and registers every regular file it finds in the
  * DKV, reporting the outcome as JSON objects.
  *
  * <p>Files are handled according to their name and size:
  *
  * <ul>
  *   <li>names ending in {@code Extensions.JSON} are parsed as a JSON object that must carry
  *       {@code Constants.VERSION} and {@code Constants.TYPE}; the type names a class that is
  *       instantiated as an {@code OldModel} and populated from the JSON;
  *   <li>names ending in {@code Extensions.HEX} have at most their first 1 MiB read and decoded
  *       into a {@code ValueArray};
  *   <li>files of at least {@code 2 * ValueArray.CHUNK_SZ} bytes are wrapped in a {@code
  *       ValueArray};
  *   <li>everything else becomes a plain {@code Value}.
  * </ul>
  *
  * <p>The whole directory listing is processed inside a single try block: the first exception
  * stops processing of the remaining entries of {@code p} and is reported once, against {@code
  * p} itself, in {@code failed}.
  *
  * @param fs file system to read from; when {@code null} the method does nothing
  * @param p directory to scan
  * @param succeeded receives one object (key, file, size) per file that was put into the DKV
  * @param failed receives one object (file, error message) when scanning {@code p} fails
  */
 private static void addFolder(FileSystem fs, Path p, JsonArray succeeded, JsonArray failed) {
   try {
     if (fs == null) return;
     for (FileStatus file : fs.listStatus(p)) {
       Path pfs = file.getPath();
       if (file.isDir()) {
         addFolder(fs, pfs, succeeded, failed);
       } else {
         Key k = Key.make(pfs.toString());
         long size = file.getLen();
         Value val = null;
         if (pfs.getName().endsWith(Extensions.JSON)) {
           FSDataInputStream in = fs.open(pfs);
           try {
             JsonParser parser = new JsonParser();
             JsonObject json = parser.parse(new InputStreamReader(in)).getAsJsonObject();
             JsonElement v = json.get(Constants.VERSION);
             if (v == null) throw new RuntimeException("Missing version");
             JsonElement type = json.get(Constants.TYPE);
             if (type == null) throw new RuntimeException("Missing type");
             Class c = Class.forName(type.getAsString());
             OldModel model = (OldModel) c.newInstance();
             model.fromJson(json);
           } finally {
             // Always release the HDFS stream, even when the JSON is malformed.
             in.close();
           }
           // NOTE(review): val is still null here, so DKV.put below receives a null value for
           // JSON model files -- confirm this is intended.
         } else if (pfs.getName().endsWith(Extensions.HEX)) { // Hex file?
           int sz = (int) Math.min(1L << 20, size); // Read up to the 1st meg
           byte[] mem = MemoryManager.malloc1(sz);
           FSDataInputStream s = fs.open(pfs);
           try {
             s.readFully(mem);
           } finally {
             // Always release the HDFS stream, even when the read fails.
             s.close();
           }
           // Convert to a ValueArray (hope it fits in 1Meg!)
           ValueArray ary = new ValueArray(k, 0).read(new AutoBuffer(mem));
           val = new Value(k, ary, Value.HDFS);
         } else if (size >= 2 * ValueArray.CHUNK_SZ) {
           // ValueArray byte wrapper over a large file
           val = new Value(k, new ValueArray(k, size), Value.HDFS);
         } else {
           val = new Value(k, (int) size, Value.HDFS); // Plain Value
           val.setdsk();
         }
         DKV.put(k, val);
         Log.info("PersistHdfs: DKV.put(" + k + ")");
         JsonObject o = new JsonObject();
         o.addProperty(Constants.KEY, k.toString());
         o.addProperty(Constants.FILE, pfs.toString());
         o.addProperty(Constants.VALUE_SIZE, file.getLen());
         succeeded.add(o);
       }
     }
   } catch (Exception e) {
     Log.err(e);
     JsonObject o = new JsonObject();
     o.addProperty(Constants.FILE, p.toString());
     o.addProperty(Constants.ERROR, e.getMessage());
     failed.add(o);
   }
 }
Exemple #2
0
 // Loads the "/page.html" resource once, when the class is initialized, substitutes the
 // "%cloud_name" placeholder with H2O.NAME and stores the result in _htmlTemplate.
 // Any failure is reported through Log.die.
 static {
   InputStream resource = Boot._init.getResource2("/page.html");
   // Test for a missing resource explicitly instead of catching NullPointerException, so that
   // an unrelated NPE is not misreported as "page.html not found".
   // NOTE(review): assumes getResource2 signals a missing resource by returning null -- confirm.
   if (resource == null) Log.die("page.html not found in resources.");
   try {
     _htmlTemplate =
         new String(ByteStreams.toByteArray(resource)).replace("%cloud_name", H2O.NAME);
   } catch (Exception e) {
     Log.err(e);
     Log.die(e.getMessage());
   } finally {
     Closeables.closeQuietly(resource);
   }
 }
Exemple #3
0
 /**
  * Folds the partial results of another {@code DParseTask} into this one.
  *
  * <p>If this task has no {@code _invalidValues} yet, it adopts the other task's statistics
  * arrays by reference. Otherwise the statistics are combined element-wise: in pass one the
  * per-chunk row counts and means are summed, enums are merged, and min / max / scale / column
  * type are reduced to their extreme values; in pass two the sigmas are summed. Invalid-value
  * counts are summed in both passes. Finally {@code _myrows} is accumulated and error messages
  * are concatenated, separated by newlines.
  *
  * <p>This method never throws an {@code Exception}: a failure during the merge is printed and
  * appended to {@code _error}, so a broken reduce is not mistaken for a successful one.
  *
  * @param drt the task to merge in; it is cast to {@code DParseTask}
  */
 @Override
 public void reduce(DRemoteTask drt) {
   try {
     DParseTask other = (DParseTask) drt;
     if (_sigma == null) _sigma = other._sigma;
     if (_invalidValues == null) {
       // Nothing accumulated locally yet: take over the other task's arrays as they are.
       _enums = other._enums;
       _min = other._min;
       _max = other._max;
       _mean = other._mean;
       _sigma = other._sigma;
       _scale = other._scale;
       _colTypes = other._colTypes;
       _nrows = other._nrows;
       _invalidValues = other._invalidValues;
     } else {
       if (_phase == Pass.ONE) {
         // The identity checks skip the merge when both tasks share the very same object,
         // which would otherwise count its contents twice.
         if (_nrows != other._nrows)
           for (int i = 0; i < _nrows.length; ++i) _nrows[i] += other._nrows[i];
         for (int i = 0; i < _ncolumns; ++i) {
           if (_enums[i] != other._enums[i]) _enums[i].merge(other._enums[i]);
           if (other._min[i] < _min[i]) _min[i] = other._min[i];
           if (other._max[i] > _max[i]) _max[i] = other._max[i];
           if (other._scale[i] < _scale[i]) _scale[i] = other._scale[i];
           if (other._colTypes[i] > _colTypes[i]) _colTypes[i] = other._colTypes[i];
           _mean[i] += other._mean[i];
         }
       } else if (_phase == Pass.TWO) {
         for (int i = 0; i < _ncolumns; ++i) _sigma[i] += other._sigma[i];
       } else assert false : "unexpected _phase value:" + _phase;
       for (int i = 0; i < _ncolumns; ++i) _invalidValues[i] += other._invalidValues[i];
     }
     _myrows += other._myrows;
     if (_error == null) _error = other._error;
     else if (other._error != null) _error = _error + "\n" + other._error;
   } catch (Exception e) {
     e.printStackTrace();
     // Record the failure instead of silently dropping it; fall back to toString() because
     // some exceptions (e.g. NullPointerException) carry no message.
     String failure = e.getMessage() != null ? e.getMessage() : e.toString();
     _error = (_error == null) ? failure : _error + "\n" + failure;
   }
 }
Exemple #4
0
  /**
   * Map function for distributed parsing of CSV files.
   *
   * <p>In the first pass it parses the chunk to collect the column statistics set up by {@code
   * phaseOneInitialize()} and, for arraylet chunks, stores the resulting {@code _myrows} in
   * {@code _nrows} at the chunk's index.
   *
   * <p>The second pass determines which rows this chunk covers, creates the output streams for
   * them and parses the chunk again, storing the last stream if the parser left it open.
   *
   * <p>For arraylet chunks the first line is skipped on every chunk but the first, in addition
   * to whatever {@code _skipFirstLine} requests.
   *
   * <p>This method never throws an {@code Exception}: failures are printed and recorded in
   * {@code _error}.
   *
   * @param key key of the value (or arraylet chunk) to parse
   */
  @Override
  public void map(Key key) {
    try {
      Key aryKey = null;
      boolean arraylet = key._kb[0] == Key.ARRAYLET_CHUNK;
      boolean skipFirstLine = _skipFirstLine;
      if (arraylet) {
        aryKey = ValueArray.getArrayKey(key);
        _chunkId = ValueArray.getChunkIndex(key);
        skipFirstLine = skipFirstLine || (ValueArray.getChunkIndex(key) != 0);
      }
      switch (_phase) {
        case ONE:
          assert (_ncolumns != 0);
          // initialize the column statistics
          phaseOneInitialize();
          // perform the parse
          CsvParser p = new CsvParser(aryKey, _ncolumns, _sep, _decSep, this, skipFirstLine);
          p.parse(key);
          if (arraylet) {
            long idx = ValueArray.getChunkIndex(key);
            int idx2 = (int) idx;
            assert idx2 == idx; // the chunk index must fit in an int to index _nrows
            assert (_nrows[idx2] == 0)
                : idx
                    + ": "
                    + Arrays.toString(_nrows)
                    + " ("
                    + _nrows[idx2]
                    + " -- "
                    + _myrows
                    + ")";
            _nrows[idx2] = _myrows;
          }
          break;
        case TWO:
          assert (_ncolumns != 0);
          // initialize statistics - invalid rows, sigma and row size
          phaseTwoInitialize();
          // calculate the first row and the number of rows to parse
          int firstRow = 0;
          int lastRow = _myrows;
          _myrows = 0;
          if (arraylet) {
            // The row range is taken from neighbouring _nrows entries.
            // NOTE(review): this presumes _nrows holds cumulative row counts by now -- confirm.
            long origChunkIdx = ValueArray.getChunkIndex(key);
            firstRow = (origChunkIdx == 0) ? 0 : _nrows[(int) origChunkIdx - 1];
            lastRow = _nrows[(int) origChunkIdx];
          }
          int rowsToParse = lastRow - firstRow;
          // create the output streams
          _outputStreams2 = createRecords(firstRow, rowsToParse);
          assert (_outputStreams2.length > 0);
          _ab = _outputStreams2[0].initialize();
          // perform the second parse pass
          CsvParser p2 = new CsvParser(aryKey, _ncolumns, _sep, _decSep, this, skipFirstLine);
          p2.parse(key);
          // store the last stream if not stored during the parse
          if (_ab != null) _outputStreams2[_outputIdx].store();
          break;
        default:
          assert (false);
      }

      ParseStatus.update(_resultKey, DKV.get(key).length(), _phase);
    } catch (Exception e) {
      e.printStackTrace();
      // Exceptions such as NullPointerException have no message; fall back to toString() so a
      // failed map never leaves _error null and passes for a success.
      _error = e.getMessage() != null ? e.getMessage() : e.toString();
    }
  }